From a34375ef9e65340a138fc0be287de5c940d260fc Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Mon, 8 Sep 2014 09:51:30 +0900
Subject: percpu-refcount: add @gfp to percpu_ref_init()

Percpu allocator now supports allocation mask. Add @gfp to
percpu_ref_init() so that !GFP_KERNEL allocation masks can be used
with percpu_refs too.

This patch doesn't make any functional difference.

v2: blk-mq conversion was missing. Updated.

Signed-off-by: Tejun Heo
Cc: Kent Overstreet
Cc: Benjamin LaHaise
Cc: Li Zefan
Cc: Nicholas A. Bellinger
Cc: Jens Axboe
---
 block/blk-mq.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5189cb1e478a..702df07b980d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1776,7 +1776,8 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!q)
 		goto err_hctxs;
 
-	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release))
+	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
+			    GFP_KERNEL))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
--
cgit v1.2.3


From 9eca80461a45177e456219a9cd944c27675d6512 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 24 Sep 2014 13:07:33 -0400
Subject: Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"

This reverts commit 0a30288da1aec914e158c2d7a3482a85f632750f, which
was a temporary fix for SCSI blk-mq stall issue. The following
patches will fix the issue properly by introducing atomic mode to
percpu_ref.

Signed-off-by: Tejun Heo
Cc: Kent Overstreet
Cc: Jens Axboe
Cc: Christoph Hellwig
---
 block/blk-mq.c                  | 11 +----------
 include/linux/percpu-refcount.h |  1 -
 lib/percpu-refcount.c           | 16 ----------------
 3 files changed, 1 insertion(+), 27 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 255d79c14dc1..44a78ae3f899 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
 	spin_unlock_irq(q->queue_lock);
 
 	if (freeze) {
-		/*
-		 * XXX: Temporary kludge to work around SCSI blk-mq stall.
-		 * SCSI synchronously creates and destroys many queues
-		 * back-to-back during probe leading to lengthy stalls.
-		 * This will be fixed by keeping ->mq_usage_counter in
-		 * atomic mode until genhd registration, but, for now,
-		 * let's work around using expedited synchronization.
-		 */
-		__percpu_ref_kill_expedited(&q->mq_usage_counter);
-
+		percpu_ref_kill(&q->mq_usage_counter);
 		blk_mq_run_queues(q, false);
 	}
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 11b38ceca7e2..5df6784bd9d2 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -72,7 +72,6 @@ void percpu_ref_reinit(struct percpu_ref *ref);
 void percpu_ref_exit(struct percpu_ref *ref);
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_kill);
-void __percpu_ref_kill_expedited(struct percpu_ref *ref);
 
 /**
  * percpu_ref_kill - drop the initial ref
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index c6c31e2829b1..559ee0b20318 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -189,19 +189,3 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 	call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
 }
 EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
-
-/*
- * XXX: Temporary kludge to work around SCSI blk-mq stall.  Used only by
- * block/blk-mq.c::blk_mq_freeze_queue().  Will be removed during v3.18
- * devel cycle.  Do not use anywhere else.
- */
-void __percpu_ref_kill_expedited(struct percpu_ref *ref)
-{
-	WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD,
-		  "percpu_ref_kill() called more than once on %pf!",
-		  ref->release);
-
-	ref->pcpu_count_ptr |= PCPU_REF_DEAD;
-	synchronize_sched_expedited();
-	percpu_ref_kill_rcu(&ref->rcu);
-}
--
cgit v1.2.3


From 2aad2a86f6685c10360ec8a5a55eb9ab7059cb72 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 24 Sep 2014 13:31:50 -0400
Subject: percpu_ref: add PERCPU_REF_INIT_* flags

With the recent addition of percpu_ref_reinit(), percpu_ref now can
be used as a persistent switch which can be turned on and off
repeatedly where turning off maps to killing the ref and waiting for
it to drain; however, there currently isn't a way to initialize a
percpu_ref in its off (killed and drained) state, which can be
inconvenient for certain persistent switch use cases.

Similarly, percpu_ref_switch_to_atomic/percpu() allow dynamic
selection of operation mode; however, currently a newly initialized
percpu_ref is always in percpu mode making it impossible to avoid the
latency overhead of switching to atomic mode.

This patch adds @flags to percpu_ref_init() and implements the
following flags.

* PERCPU_REF_INIT_ATOMIC : start ref in atomic mode
* PERCPU_REF_INIT_DEAD   : start ref killed and drained

These flags should be able to serve the above two use cases.

v2: target_core_tpg.c conversion was missing. Fixed.

Signed-off-by: Tejun Heo
Reviewed-by: Kent Overstreet
Cc: Jens Axboe
Cc: Christoph Hellwig
Cc: Johannes Weiner
---
 block/blk-mq.c                   |  2 +-
 drivers/target/target_core_tpg.c |  2 +-
 fs/aio.c                         |  4 ++--
 include/linux/percpu-refcount.h  | 18 +++++++++++++++++-
 kernel/cgroup.c                  |  7 ++++---
 lib/percpu-refcount.c            | 23 ++++++++++++++++++-----
 6 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 44a78ae3f899..d85fe01c44ef 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1796,7 +1796,7 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 		goto err_hctxs;
 
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
-			    GFP_KERNEL))
+			    0, GFP_KERNEL))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index 4ab6da338585..be783f717f19 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -819,7 +819,7 @@ int core_tpg_add_lun(
 {
 	int ret;
 
-	ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release,
+	ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0,
 			      GFP_KERNEL);
 	if (ret < 0)
 		return ret;
diff --git a/fs/aio.c b/fs/aio.c
index 8d217ed04e6e..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
-	if (percpu_ref_init(&ctx->users, free_ioctx_users, GFP_KERNEL))
+	if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
 		goto err;
 
-	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, GFP_KERNEL))
+	if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
 		goto err;
 
 	ctx->cpu = alloc_percpu(struct kioctx_cpu);
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index cd7e20f0fe47..b0293f268cd2 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -63,6 +63,21 @@ enum {
 	__PERCPU_REF_FLAG_BITS	= 2,
 };
 
+/* @flags for percpu_ref_init() */
+enum {
+	/*
+	 * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
+	 * operation using percpu_ref_switch_to_percpu().
+	 */
+	PERCPU_REF_INIT_ATOMIC	= 1 << 0,
+
+	/*
+	 * Start dead w/ ref == 0 in atomic mode.  Must be revived with
+	 * percpu_ref_reinit() before used.  Implies INIT_ATOMIC.
+	 */
+	PERCPU_REF_INIT_DEAD	= 1 << 1,
+};
+
 struct percpu_ref {
 	atomic_long_t		count;
 	/*
@@ -76,7 +91,8 @@ struct percpu_ref {
 };
 
 int __must_check percpu_ref_init(struct percpu_ref *ref,
-				 percpu_ref_func_t *release, gfp_t gfp);
+				 percpu_ref_func_t *release, unsigned int flags,
+				 gfp_t gfp);
 void percpu_ref_exit(struct percpu_ref *ref);
 void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
 				 percpu_ref_func_t *confirm_switch);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a99d504294de..753df01a9831 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1634,7 +1634,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
 		goto out;
 	root_cgrp->id = ret;
 
-	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, GFP_KERNEL);
+	ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+			      GFP_KERNEL);
 	if (ret)
 		goto out;
 
@@ -4510,7 +4511,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 	init_and_link_css(css, ss, cgrp);
 
-	err = percpu_ref_init(&css->refcnt, css_release, GFP_KERNEL);
+	err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
 	if (err)
 		goto err_free_css;
 
@@ -4583,7 +4584,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 		goto out_unlock;
 	}
 
-	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, GFP_KERNEL);
+	ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
 	if (ret)
 		goto out_free_cgrp;
 
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 5a6d43baccc5..ed280fb1e5b5 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -45,27 +45,40 @@ static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
  * percpu_ref_init - initialize a percpu refcount
  * @ref: percpu_ref to initialize
  * @release: function which will be called when refcount hits 0
+ * @flags: PERCPU_REF_INIT_* flags
  * @gfp: allocation mask to use
  *
- * Initializes the refcount in single atomic counter mode with a refcount of 1;
- * analagous to atomic_long_set(ref, 1).
+ * Initializes @ref.  If @flags is zero, @ref starts in percpu mode with a
+ * refcount of 1; analagous to atomic_long_set(ref, 1).  See the
+ * definitions of PERCPU_REF_INIT_* flags for flag behaviors.
  *
  * Note that @release must not sleep - it may potentially be called from RCU
  * callback context by percpu_ref_kill().
  */
 int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
-		    gfp_t gfp)
+		    unsigned int flags, gfp_t gfp)
 {
 	size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
 			     __alignof__(unsigned long));
-
-	atomic_long_set(&ref->count, 1 + PERCPU_COUNT_BIAS);
+	unsigned long start_count = 0;
 
 	ref->percpu_count_ptr = (unsigned long)
 		__alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
 	if (!ref->percpu_count_ptr)
 		return -ENOMEM;
 
+	if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD))
+		ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
+	else
+		start_count += PERCPU_COUNT_BIAS;
+
+	if (flags & PERCPU_REF_INIT_DEAD)
+		ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
+	else
+		start_count++;
+
+	atomic_long_set(&ref->count, start_count);
+
 	ref->release = release;
 	return 0;
 }
--
cgit v1.2.3


From 17497acbdce9506fd6a75115dee4ab80c3cc5ee5 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Wed, 24 Sep 2014 13:31:50 -0400
Subject: blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode

blk-mq uses percpu_ref for its usage counter which tracks the number
of in-flight commands and is used to synchronously drain the queue on
freeze. percpu_ref shutdown takes measurable wallclock time as it
involves a sched RCU grace period. This means that draining a blk-mq
queue takes measurable wallclock time. One would think that this
shouldn't matter as queue shutdown should be a rare event which takes
place asynchronously w.r.t. userland.

Unfortunately, SCSI probing involves synchronously setting up and then
tearing down a lot of request_queues back-to-back for non-existent
LUNs. This means that SCSI probing may take more than ten seconds when
scsi-mq is used.

  [    0.949892] scsi host0: Virtio SCSI HBA
  [    1.007864] scsi 0:0:0:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5
  [    1.021299] scsi 0:0:1:0: Direct-Access QEMU QEMU HARDDISK 1.1. PQ: 0 ANSI: 5
  [    1.520356] tsc: Refined TSC clocksource calibration: 2491.910 MHz
  [   16.186549] sd 0:0:0:0: Attached scsi generic sg0 type 0
  [   16.190478] sd 0:0:1:0: Attached scsi generic sg1 type 0
  [   16.194099] osd: LOADED open-osd 0.2.1
  [   16.203202] sd 0:0:0:0: [sda] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB)
  [   16.208478] sd 0:0:0:0: [sda] Write Protect is off
  [   16.211439] sd 0:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
  [   16.218771] sd 0:0:1:0: [sdb] 31457280 512-byte logical blocks: (16.1 GB/15.0 GiB)
  [   16.223264] sd 0:0:1:0: [sdb] Write Protect is off
  [   16.225682] sd 0:0:1:0: [sdb] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA

This is also the reason why request_queues start in bypass mode which
is ended on blk_register_queue() as shutting down a fully functional
queue also involves an RCU grace period and the queues for non-existent
SCSI devices never reach registration.

blk-mq basically needs to do the same thing - start the queue in a
degraded mode which is faster to shut down and then make it fully
functional only after the queue reaches registration. percpu_ref
recently grew facilities to force atomic operation until explicitly
switched to percpu mode, which can be used for this purpose. This patch
makes blk-mq initialize q->mq_usage_counter in atomic mode and switch
it to percpu mode only once blk_register_queue() is reached.

Note that this issue was previously worked around by 0a30288da1ae
("blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during
probe") for v3.17.
The temp fix was reverted in preparation for adding persistent atomic
mode to percpu_ref by 9eca80461a45 ("Revert "blk-mq, percpu_ref:
implement a kludge for SCSI blk-mq stall during probe""). This patch
and the prerequisite percpu_ref changes will be merged during the
v3.18 devel cycle.

Signed-off-by: Tejun Heo
Reported-by: Christoph Hellwig
Link: http://lkml.kernel.org/g/20140919113815.GA10791@lst.de
Fixes: add703fda981 ("blk-mq: use percpu_ref for mq usage count")
Reviewed-by: Kent Overstreet
Cc: Jens Axboe
Cc: Johannes Weiner
---
 block/blk-mq-sysfs.c   |  6 ++++++
 block/blk-mq.c         |  6 +++++-
 block/blk-sysfs.c      | 11 +++++++++--
 include/linux/blk-mq.h |  1 +
 4 files changed, 21 insertions(+), 3 deletions(-)

(limited to 'block')

diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ed5217867555..371d8800b48a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q)
 	}
 }
 
+/* see blk_register_queue() */
+void blk_mq_finish_init(struct request_queue *q)
+{
+	percpu_ref_switch_to_percpu(&q->mq_usage_counter);
+}
+
 int blk_mq_register_disk(struct gendisk *disk)
 {
 	struct device *dev = disk_to_dev(disk);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d85fe01c44ef..38f4a165640d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1795,8 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 	if (!q)
 		goto err_hctxs;
 
+	/*
+	 * Init percpu_ref in atomic mode so that it's faster to shutdown.
+	 * See blk_register_queue() for details.
+	 */
 	if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
-			    0, GFP_KERNEL))
+			    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
 		goto err_map;
 
 	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 17f5c84ce7bf..521ae9089c50 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk)
 		return -ENXIO;
 
 	/*
-	 * Initialization must be complete by now. Finish the initial
-	 * bypass from queue allocation.
+	 * SCSI probing may synchronously create and destroy a lot of
+	 * request_queues for non-existent devices. Shutting down a fully
+	 * functional queue takes measureable wallclock time as RCU grace
+	 * periods are involved. To avoid excessive latency in these
+	 * cases, a request_queue starts out in a degraded mode which is
+	 * faster to shut down and is made fully functional here as
+	 * request_queues for non-existent devices never get registered.
 	 */
 	if (!blk_queue_init_done(q)) {
 		queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
 		blk_queue_bypass_end(q);
+		if (q->mq_ops)
+			blk_mq_finish_init(q);
 	}
 
 	ret = blk_trace_init_sysfs(dev);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a1e31f274fcd..c13a0c09faea 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,7 @@ enum {
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+void blk_mq_finish_init(struct request_queue *q);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
 
--
cgit v1.2.3
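
Taken together, the series above changes percpu_ref_init() to take both a
@flags word and a @gfp mask, and lets a ref start out in atomic mode so that
an early teardown does not have to pay for an RCU grace period. The fragment
below is a minimal sketch of how a caller outside blk-mq might use the
resulting interface; the struct and function names (my_gadget,
my_gadget_release and so on) are hypothetical and only assume the
post-series API shown in the diffs, they are not part of the series itself.

/* Illustrative sketch only - assumes the percpu_ref API as of this series. */
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/wait.h>

struct my_gadget {
	struct percpu_ref	ref;
	wait_queue_head_t	drain_wq;
};

static void my_gadget_release(struct percpu_ref *ref)
{
	struct my_gadget *g = container_of(ref, struct my_gadget, ref);

	/* runs once the last reference is gone after percpu_ref_kill() */
	wake_up(&g->drain_wq);
}

static int my_gadget_setup(struct my_gadget *g)
{
	init_waitqueue_head(&g->drain_wq);

	/*
	 * Start in atomic mode, the same trick blk-mq plays with
	 * q->mq_usage_counter: if probing fails, killing and draining
	 * the ref does not involve an RCU grace period.
	 */
	return percpu_ref_init(&g->ref, my_gadget_release,
			       PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
}

static void my_gadget_registered(struct my_gadget *g)
{
	/* probing succeeded; switch to cheap percpu operation */
	percpu_ref_switch_to_percpu(&g->ref);
}

static void my_gadget_teardown(struct my_gadget *g)
{
	/* drop the initial ref and wait for in-flight users to drain */
	percpu_ref_kill(&g->ref);
	wait_event(g->drain_wq, percpu_ref_is_zero(&g->ref));
	percpu_ref_exit(&g->ref);
}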
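
The @gfp argument from the first patch and the PERCPU_REF_INIT_DEAD flag are
not exercised by blk-mq in this series; the short fragment below illustrates
them under the same hypothetical my_gadget naming - a ref set up from a
context that cannot sleep and only turned on later, as described in the
PERCPU_REF_INIT_* flag comments.

/* Illustrative sketch only - a ref created where sleeping is not allowed. */
static int my_gadget_setup_off(struct my_gadget *g)
{
	init_waitqueue_head(&g->drain_wq);

	/* !GFP_KERNEL masks are allowed now that percpu_ref_init() takes @gfp */
	return percpu_ref_init(&g->ref, my_gadget_release,
			       PERCPU_REF_INIT_DEAD, GFP_NOWAIT);
}

static void my_gadget_turn_on(struct my_gadget *g)
{
	/* revive a ref that was started killed and drained */
	percpu_ref_reinit(&g->ref);
}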