From c3077b5d97a39223a2d4b95a21ccff660836170f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:41 +0200 Subject: blk-mq: merge blk-softirq.c into blk-mq.c __blk_complete_request is only called from the blk-mq code, and duplicates a lot of code from blk-mq.c. Move it there to prepare for better code sharing and simplifications. Signed-off-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/Makefile | 2 +- block/blk-mq.c | 135 +++++++++++++++++++++++++++++++++++++++++++++ block/blk-softirq.c | 156 ---------------------------------------------------- 3 files changed, 136 insertions(+), 157 deletions(-) delete mode 100644 block/blk-softirq.c (limited to 'block') diff --git a/block/Makefile b/block/Makefile index 78719169fb2a..8d841f5f986f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-exec.o blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o diff --git a/block/blk-mq.c b/block/blk-mq.c index a9aa6d1e44cf..60febbf6f8d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -41,6 +41,8 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -574,6 +576,130 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +static __latent_entropy void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = this_cpu_ptr(&blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); + rq->q->mq_ops->complete(rq); + } +} + +#ifdef CONFIG_SMP +static void trigger_softirq(void *data) +{ + struct request *rq = data; + struct list_head *list; + + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); +} + +/* + * Setup and invoke a run of 'trigger_softirq' on the given cpu. + */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + if (cpu_online(cpu)) { + call_single_data_t *data = &rq->csd; + + data->func = trigger_softirq; + data->info = rq; + data->flags = 0; + + smp_call_function_single_async(cpu, data); + return 0; + } + + return 1; +} +#else /* CONFIG_SMP */ +static int raise_blk_irq(int cpu, struct request *rq) +{ + return 1; +} +#endif + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + + return 0; +} + +static void __blk_complete_request(struct request *req) +{ + struct request_queue *q = req->q; + int cpu, ccpu = req->mq_ctx->cpu; + unsigned long flags; + bool shared = false; + + BUG_ON(!q->mq_ops->complete); + + local_irq_save(flags); + cpu = smp_processor_id(); + + /* + * Select completion CPU + */ + if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) + shared = cpus_share_cache(cpu, ccpu); + } else + ccpu = cpu; + + /* + * If current CPU and requested CPU share a cache, run the softirq on + * the current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ + if (ccpu == cpu || shared) { + struct list_head *list; +do_local: + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&req->ipi_list, list); + + /* + * if the list only contains our just added request, + * signal a raise of the softirq. If there are already + * entries there, someone already raised the irq but it + * hasn't run yet. + */ + if (list->next == &req->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + } else if (raise_blk_irq(ccpu, req)) + goto do_local; + + local_irq_restore(flags); +} + static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; @@ -3760,6 +3886,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", diff --git a/block/blk-softirq.c b/block/blk-softirq.c deleted file mode 100644 index 6e7ec87d49fa..000000000000 --- a/block/blk-softirq.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to softirq rq completions - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "blk.h" - -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. - */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); - rq->q->mq_ops->complete(rq); - } -} - -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) -{ - struct request *rq = data; - struct list_head *list; - - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - -static int blk_softirq_cpu_dead(unsigned int cpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - - return 0; -} - -void __blk_complete_request(struct request *req) -{ - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - } else - ccpu = cpu; - - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. This - * avoids IPI sending from current CPU to the first CPU of a group. - */ - if (ccpu == cpu || shared) { - struct list_head *list; -do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. - */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; - - local_irq_restore(flags); -} - -static __init int blk_softirq_init(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, - "block/softirq:dead", NULL, - blk_softirq_cpu_dead); - return 0; -} -subsys_initcall(blk_softirq_init); -- cgit v1.2.3 From 115243f5534c7b3980cc946e00f79740fdc0e068 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:42 +0200 Subject: blk-mq: factor out a helper to reise the block softirq Add a helper to deduplicate the logic that raises the block softirq. Signed-off-by: Christoph Hellwig Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-mq.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 60febbf6f8d9..a261e145ddfb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -598,19 +598,27 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) } } -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) +static void blk_mq_trigger_softirq(struct request *rq) { - struct request *rq = data; - struct list_head *list; + struct list_head *list = this_cpu_ptr(&blk_cpu_done); - list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); + /* + * If the list only contains our just added request, signal a raise of + * the softirq. If there are already entries there, someone already + * raised the irq but it hasn't run yet. + */ if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); } +#ifdef CONFIG_SMP +static void trigger_softirq(void *data) +{ + blk_mq_trigger_softirq(data); +} + /* * Setup and invoke a run of 'trigger_softirq' on the given cpu. */ @@ -681,19 +689,8 @@ static void __blk_complete_request(struct request *req) * avoids IPI sending from current CPU to the first CPU of a group. */ if (ccpu == cpu || shared) { - struct list_head *list; do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. - */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); + blk_mq_trigger_softirq(req); } else if (raise_blk_irq(ccpu, req)) goto do_local; -- cgit v1.2.3 From dea6f3993812c82b4dd5f61acd41c55a311a445f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:43 +0200 Subject: blk-mq: remove raise_blk_irq By open coding raise_blk_irq in the only caller, and replacing the ifdef CONFIG_SMP with an IS_ENABLED check the flow in the caller can be significantly simplified. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a261e145ddfb..ada55521601f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -613,37 +613,11 @@ static void blk_mq_trigger_softirq(struct request *rq) raise_softirq_irqoff(BLOCK_SOFTIRQ); } -#ifdef CONFIG_SMP static void trigger_softirq(void *data) { blk_mq_trigger_softirq(data); } -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - static int blk_softirq_cpu_dead(unsigned int cpu) { /* @@ -688,11 +662,17 @@ static void __blk_complete_request(struct request *req) * support multiple interrupts, so current CPU is unique actually. This * avoids IPI sending from current CPU to the first CPU of a group. */ - if (ccpu == cpu || shared) { -do_local: + if (IS_ENABLED(CONFIG_SMP) && + ccpu != cpu && !shared && cpu_online(ccpu)) { + call_single_data_t *data = &req->csd; + + data->func = trigger_softirq; + data->info = req; + data->flags = 0; + smp_call_function_single_async(cpu, data); + } else { blk_mq_trigger_softirq(req); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; + } local_irq_restore(flags); } -- cgit v1.2.3 From 6aab1da603e731383b342dbe612f92cd222fe56b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:44 +0200 Subject: blk-mq: complete polled requests directly Even for single queue devices there is no point in offloading a polled completion to the softirq, given that blk_mq_force_complete_rq is called from the polling thread in that case and thus there are no starvation issues. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ada55521601f..ea083f58d9da 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -703,6 +703,16 @@ void blk_mq_force_complete_rq(struct request *rq) int cpu; WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if (rq->cmd_flags & REQ_HIPRI) { + q->mq_ops->complete(rq); + return; + } + /* * Most of single queue controllers, there is only one irq vector * for handling IO completion, and the only irq's affinity is set @@ -717,12 +727,7 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - /* - * For a polled request, always complete locallly, it's pointless - * to redirect the completion. - */ - if ((rq->cmd_flags & REQ_HIPRI) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { q->mq_ops->complete(rq); return; } -- cgit v1.2.3 From d6cc464cc58424e137eca5e0a53226291044f5d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:45 +0200 Subject: blk-mq: short cut the IPI path in blk_mq_force_complete_rq for !SMP Let the compile optimize out the entire IPI path, given that we are obviously not going to use it. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ea083f58d9da..eef2f3c7f402 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -727,7 +727,8 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - if (!test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { q->mq_ops->complete(rq); return; } -- cgit v1.2.3 From d391a7a399e46315a8adc65eb8fb5d9123b91700 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:46 +0200 Subject: blk-mq: merge the softirq vs non-softirq IPI logic Both the softirq path for single queue devices and the multi-queue completion handler share the same logic to figure out if we need an IPI for the completion and eventually issue it. Merge the two versions into a single unified code path. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 85 ++++++++++++++-------------------------------------------- 1 file changed, 20 insertions(+), 65 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index eef2f3c7f402..ce772ab19188 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -600,8 +600,11 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) static void blk_mq_trigger_softirq(struct request *rq) { - struct list_head *list = this_cpu_ptr(&blk_cpu_done); + struct list_head *list; + unsigned long flags; + local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); list_add_tail(&rq->ipi_list, list); /* @@ -611,11 +614,7 @@ static void blk_mq_trigger_softirq(struct request *rq) */ if (list->next == &rq->ipi_list) raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -static void trigger_softirq(void *data) -{ - blk_mq_trigger_softirq(data); + local_irq_restore(flags); } static int blk_softirq_cpu_dead(unsigned int cpu) @@ -633,56 +632,26 @@ static int blk_softirq_cpu_dead(unsigned int cpu) return 0; } -static void __blk_complete_request(struct request *req) +static void __blk_mq_complete_request(struct request *rq) { - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - } else - ccpu = cpu; - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. This - * avoids IPI sending from current CPU to the first CPU of a group. + * For most of single queue controllers, there is only one irq vector + * for handling I/O completion, and the only irq's affinity is set + * to all possible CPUs. On most of ARCHs, this affinity means the irq + * is handled on one specific CPU. + * + * So complete I/O requests in softirq context in case of single queue + * devices to avoid degrading I/O performance due to irqsoff latency. */ - if (IS_ENABLED(CONFIG_SMP) && - ccpu != cpu && !shared && cpu_online(ccpu)) { - call_single_data_t *data = &req->csd; - - data->func = trigger_softirq; - data->info = req; - data->flags = 0; - smp_call_function_single_async(cpu, data); - } else { - blk_mq_trigger_softirq(req); - } - - local_irq_restore(flags); + if (rq->q->nr_hw_queues == 1) + blk_mq_trigger_softirq(rq); + else + rq->q->mq_ops->complete(rq); } static void __blk_mq_complete_request_remote(void *data) { - struct request *rq = data; - struct request_queue *q = rq->q; - - q->mq_ops->complete(rq); + __blk_mq_complete_request(data); } /** @@ -713,23 +682,9 @@ void blk_mq_force_complete_rq(struct request *rq) return; } - /* - * Most of single queue controllers, there is only one irq vector - * for handling IO completion, and the only irq's affinity is set - * as all possible CPUs. On most of ARCHs, this affinity means the - * irq is handled on one specific CPU. - * - * So complete IO reqeust in softirq context in case of single queue - * for not degrading IO performance by irqsoff latency. - */ - if (q->nr_hw_queues == 1) { - __blk_complete_request(rq); - return; - } - if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - q->mq_ops->complete(rq); + __blk_mq_complete_request(rq); return; } @@ -743,7 +698,7 @@ void blk_mq_force_complete_rq(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { - q->mq_ops->complete(rq); + __blk_mq_complete_request(rq); } put_cpu(); } -- cgit v1.2.3 From 15f73f5b3e5958f2d169fe13c420eeeeae07bbf2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:47 +0200 Subject: blk-mq: move failure injection out of blk_mq_complete_request Move the call to blk_should_fake_timeout out of blk_mq_complete_request and into the drivers, skipping call sites that are obvious error handlers, and remove the now superflous blk_mq_force_complete_rq helper. This ensures we don't keep injecting errors into completions that just terminate the Linux request after the hardware has been reset or the command has been aborted. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 34 +++++++--------------------------- block/blk-timeout.c | 6 ++---- block/blk.h | 9 --------- block/bsg-lib.c | 5 ++++- drivers/block/loop.c | 6 ++++-- drivers/block/mtip32xx/mtip32xx.c | 3 ++- drivers/block/nbd.c | 5 ++++- drivers/block/null_blk_main.c | 5 +++-- drivers/block/skd_main.c | 9 ++++++--- drivers/block/virtio_blk.c | 3 ++- drivers/block/xen-blkfront.c | 3 ++- drivers/md/dm-rq.c | 3 ++- drivers/mmc/core/block.c | 8 ++++---- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/nvme.h | 3 ++- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/scm_blk.c | 3 ++- drivers/scsi/scsi_lib.c | 12 +++--------- include/linux/blk-mq.h | 12 ++++++++++-- 19 files changed, 61 insertions(+), 72 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index ce772ab19188..3f4f227cf830 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -655,16 +655,13 @@ static void __blk_mq_complete_request_remote(void *data) } /** - * blk_mq_force_complete_rq() - Force complete the request, bypassing any error - * injection that could drop the completion. - * @rq: Request to be force completed + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed * - * Drivers should use blk_mq_complete_request() to complete requests in their - * normal IO path. For timeout error recovery, drivers may call this forced - * completion routine after they've reclaimed timed out requests to bypass - * potentially subsequent fake timeouts. - */ -void blk_mq_force_complete_rq(struct request *rq) + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; struct request_queue *q = rq->q; @@ -702,7 +699,7 @@ void blk_mq_force_complete_rq(struct request *rq) } put_cpu(); } -EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); +EXPORT_SYMBOL(blk_mq_complete_request); static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) __releases(hctx->srcu) @@ -724,23 +721,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) *srcu_idx = srcu_read_lock(hctx->srcu); } -/** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions. - * The actual completion happens out-of-order, through a IPI handler. - **/ -bool blk_mq_complete_request(struct request *rq) -{ - if (unlikely(blk_should_fake_timeout(rq->q))) - return false; - blk_mq_force_complete_rq(rq); - return true; -} -EXPORT_SYMBOL(blk_mq_complete_request); - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8aa68fae96ad..3a1ac6434758 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str) } __setup("fail_io_timeout=", setup_fail_io_timeout); -int blk_should_fake_timeout(struct request_queue *q) +bool __blk_should_fake_timeout(struct request_queue *q) { - if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) - return 0; - return should_fail(&fail_io_timeout, 1); } +EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { diff --git a/block/blk.h b/block/blk.h index b5d1f0fc6547..8ba4a5e4fe07 100644 --- a/block/blk.h +++ b/block/blk.h @@ -223,18 +223,9 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); - -#ifdef CONFIG_FAIL_IO_TIMEOUT -int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -#else -static inline int blk_should_fake_timeout(struct request_queue *q) -{ - return 0; -} -#endif void __blk_queue_split(struct request_queue *q, struct bio **bio, unsigned int *nr_segs); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 6cbb7926534c..fb7b347f8010 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -181,9 +181,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get); void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len) { + struct request *rq = blk_mq_rq_from_pdu(job); + job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_mq_complete_request(blk_mq_rq_from_pdu(job)); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } EXPORT_SYMBOL_GPL(bsg_job_done); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 475e1a738560..4acae248790c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -509,7 +509,8 @@ static void lo_rw_aio_do_completion(struct loop_cmd *cmd) return; kfree(cmd->bvec); cmd->bvec = NULL; - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) @@ -2048,7 +2049,8 @@ static void loop_handle_cmd(struct loop_cmd *cmd) cmd->ret = ret; else cmd->ret = ret ? -EIO : 0; - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } } diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index f6bafa9a68b9..153e2cdecb4d 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -492,7 +492,8 @@ static void mtip_complete_command(struct mtip_cmd *cmd, blk_status_t status) struct request *req = blk_mq_rq_from_pdu(cmd); cmd->status = status; - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); } /* diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 43cff01a5a67..01794cd2b6ca 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -784,6 +784,7 @@ static void recv_work(struct work_struct *work) struct nbd_device *nbd = args->nbd; struct nbd_config *config = nbd->config; struct nbd_cmd *cmd; + struct request *rq; while (1) { cmd = nbd_read_stat(nbd, args->index); @@ -796,7 +797,9 @@ static void recv_work(struct work_struct *work) break; } - blk_mq_complete_request(blk_mq_rq_from_pdu(cmd)); + rq = blk_mq_rq_from_pdu(cmd); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } atomic_dec(&config->recv_threads); wake_up(&config->recv_wq); diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 87b31f9ca362..82259242b9b5 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1283,7 +1283,8 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (cmd->nq->dev->queue_mode) { case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); + if (likely(!blk_should_fake_timeout(cmd->rq->q))) + blk_mq_complete_request(cmd->rq); break; case NULL_Q_BIO: /* @@ -1423,7 +1424,7 @@ static bool should_requeue_request(struct request *rq) static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) { pr_info("rq %p timed out\n", rq); - blk_mq_force_complete_rq(rq); + blk_mq_complete_request(rq); return BLK_EH_DONE; } diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 51569c199a6c..3a476dc1d14f 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1417,7 +1417,8 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_REPORT_GOOD: case SKD_CHECK_STATUS_REPORT_SMART_ALERT: skreq->status = BLK_STS_OK; - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); break; case SKD_CHECK_STATUS_BUSY_IMMINENT: @@ -1440,7 +1441,8 @@ static void skd_resolve_req_exception(struct skd_device *skdev, case SKD_CHECK_STATUS_REPORT_ERROR: default: skreq->status = BLK_STS_IOERR; - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); break; } } @@ -1560,7 +1562,8 @@ static int skd_isr_completion_posted(struct skd_device *skdev, */ if (likely(cmp_status == SAM_STAT_GOOD)) { skreq->status = BLK_STS_OK; - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } else { skd_resolve_req_exception(skdev, skreq, rq); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 9d21bf0f155e..741804bd8a14 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -171,7 +171,8 @@ static void virtblk_done(struct virtqueue *vq) while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) { struct request *req = blk_mq_rq_from_pdu(vbr); - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); req_done = true; } if (unlikely(virtqueue_is_broken(vq))) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3b889ea950c2..3bb3dd8da9b0 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1655,7 +1655,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) BUG(); } - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); } rinfo->ring.rsp_cons = i; diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index f60c02512121..5aec1cd09348 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -288,7 +288,8 @@ static void dm_complete_request(struct request *rq, blk_status_t error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_mq_complete_request(rq); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } /* diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 7896952de1ac..4791c82f8f7c 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1446,7 +1446,7 @@ static void mmc_blk_cqe_req_done(struct mmc_request *mrq) */ if (mq->in_recovery) mmc_blk_cqe_complete_rq(mq, req); - else + else if (likely(!blk_should_fake_timeout(req->q))) blk_mq_complete_request(req); } @@ -1926,7 +1926,7 @@ static void mmc_blk_hsq_req_done(struct mmc_request *mrq) */ if (mq->in_recovery) mmc_blk_cqe_complete_rq(mq, req); - else + else if (likely(!blk_should_fake_timeout(req->q))) blk_mq_complete_request(req); } @@ -1936,7 +1936,7 @@ void mmc_blk_mq_complete(struct request *req) if (mq->use_cqe) mmc_blk_cqe_complete_rq(mq, req); - else + else if (likely(!blk_should_fake_timeout(req->q))) mmc_blk_mq_complete_rq(mq, req); } @@ -1988,7 +1988,7 @@ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) */ if (mq->in_recovery) mmc_blk_mq_complete_rq(mq, req); - else + else if (likely(!blk_should_fake_timeout(req->q))) blk_mq_complete_request(req); mmc_blk_mq_dec_in_flight(mq, req); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2c5bc4fb702..6810c8812aed 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -304,7 +304,7 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) return true; nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD; - blk_mq_force_complete_rq(req); + blk_mq_complete_request(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index c0f4226d3299..034613205701 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -481,7 +481,8 @@ static inline void nvme_end_request(struct request *req, __le16 status, rq->result = result; /* inject error when permitted by fault injection framework */ nvme_should_fail(req); - blk_mq_complete_request(req); + if (likely(!blk_should_fake_timeout(req->q))) + blk_mq_complete_request(req); } static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index cf87eb27879f..eb17fea8075c 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2802,7 +2802,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) blk_update_request(req, BLK_STS_OK, blk_rq_bytes(req) - proc_bytes); blk_mq_requeue_request(req, true); - } else { + } else if (likely(!blk_should_fake_timeout(req->q))) { blk_mq_complete_request(req); } } diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index e01889394c84..a4f6f2e62b1d 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -256,7 +256,8 @@ static void scm_request_finish(struct scm_request *scmrq) for (i = 0; i < nr_requests_per_io && scmrq->request[i]; i++) { error = blk_mq_rq_to_pdu(scmrq->request[i]); *error = scmrq->error; - blk_mq_complete_request(scmrq->request[i]); + if (likely(!blk_should_fake_timeout(scmrq->request[i]->q))) + blk_mq_complete_request(scmrq->request[i]); } atomic_dec(&bdev->queued_reqs); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0ba7a65e7c8d..6ca91d09eca1 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1589,18 +1589,12 @@ static blk_status_t scsi_mq_prep_fn(struct request *req) static void scsi_mq_done(struct scsi_cmnd *cmd) { + if (unlikely(blk_should_fake_timeout(cmd->request->q))) + return; if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) return; trace_scsi_dispatch_cmd_done(cmd); - - /* - * If the block layer didn't complete the request due to a timeout - * injection, scsi must clear its internal completed state so that the - * timeout handler will see it needs to escalate its own error - * recovery. - */ - if (unlikely(!blk_mq_complete_request(cmd->request))) - clear_bit(SCMD_STATE_COMPLETE, &cmd->state); + blk_mq_complete_request(cmd->request); } static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d6fcae17da5a..8e6ab766aef7 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -503,8 +503,7 @@ void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); -bool blk_mq_complete_request(struct request *rq); -void blk_mq_force_complete_rq(struct request *rq); +void blk_mq_complete_request(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); @@ -537,6 +536,15 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); +bool __blk_should_fake_timeout(struct request_queue *q); +static inline bool blk_should_fake_timeout(struct request_queue *q) +{ + if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && + test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) + return __blk_should_fake_timeout(q); + return false; +} + /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted -- cgit v1.2.3 From 4c8fc19686dc761f600833fc9b8fa390eaf73dd5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:48 +0200 Subject: blk-mq: remove the get_cpu/put_cpu pair in blk_mq_complete_request We don't really care if we get migrated during the I/O completion. In the worth case we either perform an IPI that wasn't required, or complete the request on a CPU which we just migrated off. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f4f227cf830..95125bfe779b 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -685,7 +685,7 @@ void blk_mq_complete_request(struct request *rq) return; } - cpu = get_cpu(); + cpu = raw_smp_processor_id(); if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); @@ -697,7 +697,6 @@ void blk_mq_complete_request(struct request *rq) } else { __blk_mq_complete_request(rq); } - put_cpu(); } EXPORT_SYMBOL(blk_mq_complete_request); -- cgit v1.2.3 From 963395269c758641e1cb7208f3bdce6824ea608d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:49 +0200 Subject: blk-mq: factor out a blk_mq_complete_need_ipi helper Add a helper to decide if we can complete locally or need an IPI. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95125bfe779b..961635b40999 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -654,6 +654,24 @@ static void __blk_mq_complete_request_remote(void *data) __blk_mq_complete_request(data); } +static inline bool blk_mq_complete_need_ipi(struct request *rq) +{ + int cpu = raw_smp_processor_id(); + + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; + + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; + + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); +} + /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed @@ -663,11 +681,6 @@ static void __blk_mq_complete_request_remote(void *data) **/ void blk_mq_complete_request(struct request *rq) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - bool shared = false; - int cpu; - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* @@ -675,25 +688,15 @@ void blk_mq_complete_request(struct request *rq) * to redirect the completion. */ if (rq->cmd_flags & REQ_HIPRI) { - q->mq_ops->complete(rq); - return; - } - - if (!IS_ENABLED(CONFIG_SMP) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - __blk_mq_complete_request(rq); + rq->q->mq_ops->complete(rq); return; } - cpu = raw_smp_processor_id(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ctx->cpu); - - if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { + if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; rq->csd.info = rq; rq->csd.flags = 0; - smp_call_function_single_async(ctx->cpu, &rq->csd); + smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { __blk_mq_complete_request(rq); } -- cgit v1.2.3 From 40d09b53bfc557af7481b9d80f060a7ac9c7d314 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 11 Jun 2020 08:44:50 +0200 Subject: blk-mq: add a new blk_mq_complete_request_remote API This is a variant of blk_mq_complete_request_remote that only completes the request if it needs to be bounced to another CPU or a softirq. If the request can be completed locally the function returns false and lets the driver complete it without requring and indirect function call. Reviewed-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 45 ++++++++++++++++++++++++++------------------- include/linux/blk-mq.h | 1 + 2 files changed, 27 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 961635b40999..b8738b3c6d06 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -632,8 +632,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu) return 0; } -static void __blk_mq_complete_request(struct request *rq) + +static void __blk_mq_complete_request_remote(void *data) { + struct request *rq = data; + /* * For most of single queue controllers, there is only one irq vector * for handling I/O completion, and the only irq's affinity is set @@ -649,11 +652,6 @@ static void __blk_mq_complete_request(struct request *rq) rq->q->mq_ops->complete(rq); } -static void __blk_mq_complete_request_remote(void *data) -{ - __blk_mq_complete_request(data); -} - static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); @@ -672,14 +670,7 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } -/** - * blk_mq_complete_request - end I/O on a request - * @rq: the request being processed - * - * Description: - * Complete a request by scheduling the ->complete_rq operation. - **/ -void blk_mq_complete_request(struct request *rq) +bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); @@ -687,10 +678,8 @@ void blk_mq_complete_request(struct request *rq) * For a polled request, always complete locallly, it's pointless * to redirect the completion. */ - if (rq->cmd_flags & REQ_HIPRI) { - rq->q->mq_ops->complete(rq); - return; - } + if (rq->cmd_flags & REQ_HIPRI) + return false; if (blk_mq_complete_need_ipi(rq)) { rq->csd.func = __blk_mq_complete_request_remote; @@ -698,8 +687,26 @@ void blk_mq_complete_request(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { - __blk_mq_complete_request(rq); + if (rq->q->nr_hw_queues > 1) + return false; + blk_mq_trigger_softirq(rq); } + + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8e6ab766aef7..1641ec6cd7e5 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -504,6 +504,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); +bool blk_mq_complete_request_remote(struct request *rq); bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); bool blk_mq_queue_stopped(struct request_queue *q); -- cgit v1.2.3 From b5bd357cf8b65d31e32b1668293cbeedb6c06334 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:23 +0000 Subject: block: add docs for gendisk / request_queue refcount helpers This adds documentation for the gendisk / request_queue refcount helpers. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 13 +++++++++++++ block/genhd.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 62 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..13777c0c97f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -321,6 +321,13 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); +/** + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for + * + * Decrements the refcount of the request_queue kobject. When this reaches 0 + * we'll have blk_release_queue() called. + */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); @@ -598,6 +605,12 @@ struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) } EXPORT_SYMBOL(blk_alloc_queue); +/** + * blk_get_queue - increment the request_queue refcount + * @q: the request_queue structure to increment the refcount for + * + * Increment the refcount of the request_queue kobject. + */ bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { diff --git a/block/genhd.c b/block/genhd.c index 1a7659327664..f741613d731f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -876,6 +876,20 @@ static void invalidate_partition(struct gendisk *disk, int partno) bdput(bdev); } +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; @@ -1514,6 +1528,23 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) return 0; } +/** + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk + * + * This function releases all allocated resources of the gendisk. + * + * The struct gendisk refcount is incremented with get_gendisk() or + * get_disk_and_module(), and its refcount is decremented with + * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this + * function is called. + * + * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); @@ -1727,6 +1758,13 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } EXPORT_SYMBOL(__alloc_disk_node); +/** + * get_disk_and_module - increments the gendisk and gendisk fops module refcount + * @disk: the struct gendisk to to increment the refcount for + * + * This increments the refcount for the struct gendisk, and the gendisk's + * fops module owner. + */ struct kobject *get_disk_and_module(struct gendisk *disk) { struct module *owner; @@ -1747,6 +1785,13 @@ struct kobject *get_disk_and_module(struct gendisk *disk) } EXPORT_SYMBOL(get_disk_and_module); +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + */ void put_disk(struct gendisk *disk) { if (disk) @@ -1754,7 +1799,10 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/* +/** + * put_disk_and_module - decrements the module and gendisk refcount + * @disk: the struct gendisk to to decrement the refcount for + * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). */ -- cgit v1.2.3 From 763b58923aeb0a06c5a5f7e5fbb4c654c644d91d Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:24 +0000 Subject: block: clarify context for refcount increment helpers Let us clarify the context under which the helpers to increment the refcount for the gendisk and request_queue can be called under. We make this explicit on the places where we may sleep with might_sleep(). We don't address the decrement context yet, as that needs some extra work and fixes, but will be addressed in the next patch. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/genhd.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 13777c0c97f0..f68398cb2ef6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -610,6 +610,8 @@ EXPORT_SYMBOL(blk_alloc_queue); * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. + * + * Context: Any context. */ bool blk_get_queue(struct request_queue *q) { diff --git a/block/genhd.c b/block/genhd.c index f741613d731f..1be86b1f43ec 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -985,11 +985,15 @@ static ssize_t disk_badblocks_store(struct device *dev, * * This function gets the structure containing partitioning * information for the given device @devt. + * + * Context: can sleep */ struct gendisk *get_gendisk(dev_t devt, int *partno) { struct gendisk *disk = NULL; + might_sleep(); + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { struct kobject *kobj; @@ -1764,6 +1768,8 @@ EXPORT_SYMBOL(__alloc_disk_node); * * This increments the refcount for the struct gendisk, and the gendisk's * fops module owner. + * + * Context: Any context. */ struct kobject *get_disk_and_module(struct gendisk *disk) { -- cgit v1.2.3 From e8c7d14ac6c37c173ec606907d38802b00302988 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:25 +0000 Subject: block: revert back to synchronous request_queue removal Commit dc9edc44de6c ("block: Fix a blk_exit_rl() regression") merged on v4.12 moved the work behind blk_release_queue() into a workqueue after a splat floated around which indicated some work on blk_release_queue() could sleep in blk_exit_rl(). This splat would be possible when a driver called blk_put_queue() or blk_cleanup_queue() (which calls blk_put_queue() as its final call) from an atomic context. blk_put_queue() decrements the refcount for the request_queue kobject, and upon reaching 0 blk_release_queue() is called. Although blk_exit_rl() is now removed through commit db6d99523560 ("block: remove request_list code") on v5.0, we reserve the right to be able to sleep within blk_release_queue() context. The last reference for the request_queue must not be called from atomic context. *When* the last reference to the request_queue reaches 0 varies, and so let's take the opportunity to document when that is expected to happen and also document the context of the related calls as best as possible so we can avoid future issues, and with the hopes that the synchronous request_queue removal sticks. We revert back to synchronous request_queue removal because asynchronous removal creates a regression with expected userspace interaction with several drivers. An example is when removing the loopback driver, one uses ioctls from userspace to do so, but upon return and if successful, one expects the device to be removed. Likewise if one races to add another device the new one may not be added as it is still being removed. This was expected behavior before and it now fails as the device is still present and busy still. Moving to asynchronous request_queue removal could have broken many scripts which relied on the removal to have been completed if there was no error. Document this expectation as well so that this doesn't regress userspace again. Using asynchronous request_queue removal however has helped us find other bugs. In the future we can test what could break with this arrangement by enabling CONFIG_DEBUG_KOBJECT_RELEASE. While at it, update the docs with the context expectations for the request_queue / gendisk refcount decrement, and make these expectations explicit by using might_sleep(). Fixes: dc9edc44de6c ("block: Fix a blk_exit_rl() regression") Suggested-by: Nicolai Stange Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Reviewed-by: Bart Van Assche Cc: Bart Van Assche Cc: Omar Sandoval Cc: Hannes Reinecke Cc: Nicolai Stange Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: yu kuai Signed-off-by: Jens Axboe --- block/blk-core.c | 8 ++++++++ block/blk-sysfs.c | 43 ++++++++++++++++++++++--------------------- block/genhd.c | 17 +++++++++++++++++ include/linux/blkdev.h | 2 -- 4 files changed, 47 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index f68398cb2ef6..a99b22fac38a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -327,6 +327,9 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only); * * Decrements the refcount of the request_queue kobject. When this reaches 0 * we'll have blk_release_queue() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void blk_put_queue(struct request_queue *q) { @@ -359,9 +362,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying); * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. + * + * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { + /* cannot be called from atomic context */ + might_sleep(); + WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 02643e149d5e..561624d4cc4e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -873,22 +873,32 @@ static void blk_exit_queue(struct request_queue *q) bdi_put(q->backing_dev_info); } - /** - * __blk_release_queue - release a request queue - * @work: pointer to the release_work member of the request queue to be released + * blk_release_queue - releases all allocated resources of the request_queue + * @kobj: pointer to a kobject, whose container is a request_queue + * + * This function releases all allocated resources of the request queue. + * + * The struct request_queue refcount is incremented with blk_get_queue() and + * decremented with blk_put_queue(). Once the refcount reaches 0 this function + * is called. + * + * For drivers that have a request_queue on a gendisk and added with + * __device_add_disk() the refcount to request_queue will reach 0 with + * the last put_disk() called by the driver. For drivers which don't use + * __device_add_disk() this happens with blk_cleanup_queue(). * - * Description: - * This function is called when a block device is being unregistered. The - * process of releasing a request queue starts with blk_cleanup_queue, which - * set the appropriate flags and then calls blk_put_queue, that decrements - * the reference counter of the request queue. Once the reference counter - * of the request queue reaches zero, blk_release_queue is called to release - * all allocated resources of the request queue. + * Drivers exist which depend on the release of the request_queue to be + * synchronous, it should not be deferred. + * + * Context: can sleep */ -static void __blk_release_queue(struct work_struct *work) +static void blk_release_queue(struct kobject *kobj) { - struct request_queue *q = container_of(work, typeof(*q), release_work); + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + might_sleep(); if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) blk_stat_remove_callback(q, q->poll_cb); @@ -917,15 +927,6 @@ static void __blk_release_queue(struct work_struct *work) call_rcu(&q->rcu_head, blk_free_queue_rcu); } -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - INIT_WORK(&q->release_work, __blk_release_queue); - schedule_work(&q->release_work); -} - static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, diff --git a/block/genhd.c b/block/genhd.c index 1be86b1f43ec..60ae4e1b4d38 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -889,12 +889,19 @@ static void invalidate_partition(struct gendisk *disk, int partno) * The final removal of the struct gendisk happens when its refcount reaches 0 * with put_disk(), which should be called after del_gendisk(), if * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep */ void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; struct hd_struct *part; + might_sleep(); + blk_integrity_del(disk); disk_del_events(disk); @@ -1548,11 +1555,15 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * drivers we also call blk_put_queue() for them, and we expect the * request_queue refcount to reach 0 at this point, and so the request_queue * will also be freed prior to the disk. + * + * Context: can sleep */ static void disk_release(struct device *dev) { struct gendisk *disk = dev_to_disk(dev); + might_sleep(); + blk_free_devt(dev->devt); disk_release_events(disk); kfree(disk->random); @@ -1797,6 +1808,9 @@ EXPORT_SYMBOL(get_disk_and_module); * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk(struct gendisk *disk) { @@ -1811,6 +1825,9 @@ EXPORT_SYMBOL(put_disk); * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. */ void put_disk_and_module(struct gendisk *disk) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 98712cfc7a34..e214e0e9f868 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -584,8 +584,6 @@ struct request_queue { size_t cmd_size; - struct work_struct release_work; - #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; -- cgit v1.2.3 From 85e0cbbb8a79537dbc465e9deb449a08b2b092a6 Mon Sep 17 00:00:00 2001 From: Luis Chamberlain Date: Fri, 19 Jun 2020 20:47:30 +0000 Subject: block: create the request_queue debugfs_dir on registration We were only creating the request_queue debugfs_dir only for make_request block drivers (multiqueue), but never for request-based block drivers. We did this as we were only creating non-blktrace additional debugfs files on that directory for make_request drivers. However, since blktrace *always* creates that directory anyway, we special-case the use of that directory on blktrace. Other than this being an eye-sore, this exposes request-based block drivers to the same debugfs fragile race that used to exist with make_request block drivers where if we start adding files onto that directory we can later run a race with a double removal of dentries on the directory if we don't deal with this carefully on blktrace. Instead, just simplify things by always creating the request_queue debugfs_dir on request_queue registration. Rename the mutex also to reflect the fact that this is used outside of the blktrace context. Signed-off-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 8 +------ block/blk-mq-debugfs.c | 5 ----- block/blk-sysfs.c | 9 ++++++++ block/blk.h | 2 -- include/linux/blkdev.h | 5 +++-- kernel/trace/blktrace.c | 58 ++++++++++++++++++++++--------------------------- 6 files changed, 39 insertions(+), 48 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index a99b22fac38a..a9769c1a2875 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -51,9 +51,7 @@ #include "blk-pm.h" #include "blk-rq-qos.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); @@ -555,9 +553,7 @@ struct request_queue *__blk_alloc_queue(int node_id) kobject_init(&q->kobj, &blk_queue_ktype); -#ifdef CONFIG_BLK_DEV_IO_TRACE - mutex_init(&q->blk_trace_mutex); -#endif + mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); @@ -1931,9 +1927,7 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); -#ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); -#endif return 0; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 15df3a36e9fa..a2800bc56fb4 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -824,9 +824,6 @@ void blk_mq_debugfs_register(struct request_queue *q) struct blk_mq_hw_ctx *hctx; int i; - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); - debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); /* @@ -857,9 +854,7 @@ void blk_mq_debugfs_register(struct request_queue *q) void blk_mq_debugfs_unregister(struct request_queue *q) { - debugfs_remove_recursive(q->debugfs_dir); q->sched_debugfs_dir = NULL; - q->debugfs_dir = NULL; } static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 561624d4cc4e..be67952e7be2 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -917,6 +918,9 @@ static void blk_release_queue(struct kobject *kobj) blk_mq_release(q); blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); + debugfs_remove_recursive(q->debugfs_dir); + mutex_unlock(&q->debugfs_mutex); if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); @@ -989,6 +993,11 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); + mutex_unlock(&q->debugfs_mutex); + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); diff --git a/block/blk.h b/block/blk.h index 8ba4a5e4fe07..3a120a070dac 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,9 +14,7 @@ /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) -#ifdef CONFIG_DEBUG_FS extern struct dentry *blk_debugfs_root; -#endif struct blk_flush_queue { unsigned int flush_pending_idx:1; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e214e0e9f868..c0701237116d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -528,9 +528,9 @@ struct request_queue { unsigned int sg_timeout; unsigned int sg_reserved_size; int node; + struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; - struct mutex blk_trace_mutex; #endif /* * for flush operations @@ -574,8 +574,9 @@ struct request_queue { struct list_head tag_set_list; struct bio_set bio_split; -#ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; + +#ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 46c273f4dec5..c086c38f4954 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -348,7 +348,7 @@ static int __blk_trace_remove(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (!bt) return -EINVAL; @@ -362,9 +362,9 @@ int blk_trace_remove(struct request_queue *q) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_remove(q); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -483,14 +483,11 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct dentry *dir = NULL; int ret; - lockdep_assert_held(&q->blk_trace_mutex); + lockdep_assert_held(&q->debugfs_mutex); if (!buts->buf_size || !buts->buf_nr) return -EINVAL; - if (!blk_debugfs_root) - return -ENOENT; - strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; @@ -505,7 +502,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, * we can be. */ if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { pr_warn("Concurrent blktraces are not allowed on %s\n", buts->name); return -EBUSY; @@ -524,18 +521,15 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->msg_data) goto err; -#ifdef CONFIG_BLK_DEBUG_FS /* - * When tracing whole make_request drivers (multiqueue) block devices, - * reuse the existing debugfs directory created by the block layer on - * init. For request-based block devices, all partitions block devices, + * When tracing the whole disk reuse the existing debugfs directory + * created by the block layer on init. For partitions block devices, * and scsi-generic block devices we create a temporary new debugfs * directory that will be removed once the trace ends. */ - if (queue_is_mq(q) && bdev && bdev == bdev->bd_contains) + if (bdev && bdev == bdev->bd_contains) dir = q->debugfs_dir; else -#endif bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); /* @@ -617,9 +611,9 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_setup(q, name, dev, bdev, arg); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -665,7 +659,7 @@ static int __blk_trace_startstop(struct request_queue *q, int start) struct blk_trace *bt; bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -705,9 +699,9 @@ int blk_trace_startstop(struct request_queue *q, int start) { int ret; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); ret = __blk_trace_startstop(q, start); - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -736,7 +730,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) if (!q) return -ENXIO; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); switch (cmd) { case BLKTRACESETUP: @@ -763,7 +757,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) break; } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); return ret; } @@ -774,14 +768,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) **/ void blk_trace_shutdown(struct request_queue *q) { - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); if (rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex))) { + lockdep_is_held(&q->debugfs_mutex))) { __blk_trace_startstop(q, 0); __blk_trace_remove(q); } - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); } #ifdef CONFIG_BLK_CGROUP @@ -1662,7 +1656,7 @@ static int blk_trace_remove_queue(struct request_queue *q) struct blk_trace *bt; bt = rcu_replace_pointer(q->blk_trace, NULL, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (bt == NULL) return -EINVAL; @@ -1837,10 +1831,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { ret = sprintf(buf, "%u\n", !!bt); goto out_unlock_bdev; @@ -1858,7 +1852,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, ret = sprintf(buf, "%llu\n", bt->end_lba); out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: @@ -1901,10 +1895,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (q == NULL) goto out_bdput; - mutex_lock(&q->blk_trace_mutex); + mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); if (attr == &dev_attr_enable) { if (!!value == !!bt) { ret = 0; @@ -1921,7 +1915,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (bt == NULL) { ret = blk_trace_setup_queue(q, bdev); bt = rcu_dereference_protected(q->blk_trace, - lockdep_is_held(&q->blk_trace_mutex)); + lockdep_is_held(&q->debugfs_mutex)); } if (ret == 0) { @@ -1936,7 +1930,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, } out_unlock_bdev: - mutex_unlock(&q->blk_trace_mutex); + mutex_unlock(&q->debugfs_mutex); out_bdput: bdput(bdev); out: -- cgit v1.2.3 From 1f4fe21cf45c799a2fef41ae23dd2a8a8dbb93b7 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 19 Jun 2020 19:49:49 -0500 Subject: block: bio: Use struct_size() in kmalloc() Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Addresses-KSPP-ID: https://github.com/KSPP/linux/issues/83 Signed-off-by: Jens Axboe --- block/bio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index a7366c02c9b5..fb5533416fa6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -444,9 +444,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, if (nr_iovecs > UIO_MAXIOV) return NULL; - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); front_pad = 0; inline_vecs = nr_iovecs; } else { -- cgit v1.2.3 From f61d6e259c7ebb9a134dee5cd0b32c192d726984 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 19 Jun 2020 18:08:30 -0500 Subject: blk-iocost: Use struct_size() in kzalloc_node() Make use of the struct_size() helper instead of an open-coded version in order to avoid any potential type mistakes. This code was detected with the help of Coccinelle and, audited and fixed manually. Signed-off-by: Gustavo A. R. Silva Addresses-KSPP-ID: https://github.com/KSPP/linux/issues/83 Signed-off-by: Jens Axboe --- block/blk-iocost.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8ac4aad66ebc..cea5ee9be639 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2045,8 +2045,7 @@ static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), - gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); if (!iocg) return NULL; -- cgit v1.2.3 From f3bdc62fd82ed93dbe4d049eacba310de7eb2a6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 17 Jun 2020 15:58:23 +0200 Subject: blktrace: Provide event for request merging Currently blk-mq does not report any event when two requests get merged in the elevator. This then results in difficult to understand sequence of events like: ... 8,0 34 1579 0.608765271 2718 I WS 215023504 + 40 [dbench] 8,0 34 1584 0.609184613 2719 A WS 215023544 + 56 <- (8,4) 2160568 8,0 34 1585 0.609184850 2719 Q WS 215023544 + 56 [dbench] 8,0 34 1586 0.609188524 2719 G WS 215023544 + 56 [dbench] 8,0 3 602 0.609684162 773 D WS 215023504 + 96 [kworker/3:1H] 8,0 34 1591 0.609843593 0 C WS 215023504 + 96 [0] and you can only guess (after quite some headscratching since the above excerpt is intermixed with a lot of other IO) that request 215023544+56 got merged to request 215023504+40. Provide proper event for request merging like we used to do in the legacy block layer. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 ++ include/trace/events/block.h | 15 +++++++++++++++ kernel/trace/blktrace.c | 10 ++++++++++ 3 files changed, 27 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..9c9fb21584b6 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -793,6 +793,8 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); + trace_block_rq_merge(q, next); + /* * ownership of bio passed from next to req, return 'next' for * the caller to free diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 93b114226af8..34d64ca306b1 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -211,6 +211,21 @@ DEFINE_EVENT(block_rq, block_rq_issue, TP_ARGS(q, rq) ); +/** + * block_rq_merge - merge request with another one in the elevator + * @q: queue holding operation + * @rq: block IO operation operation request + * + * Called when block operation request @rq from queue @q is merged to another + * request queued in the elevator. + */ +DEFINE_EVENT(block_rq, block_rq_merge, + + TP_PROTO(struct request_queue *q, struct request *rq), + + TP_ARGS(q, rq) +); + /** * block_bio_bounce - used bounce buffer when processing block operation * @q: queue holding the block operation diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c086c38f4954..7ba62d68885a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -860,6 +860,13 @@ static void blk_add_trace_rq_issue(void *ignore, blk_trace_request_get_cgid(q, rq)); } +static void blk_add_trace_rq_merge(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, + blk_trace_request_get_cgid(q, rq)); +} + static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) @@ -1144,6 +1151,8 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); + ret = register_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); + WARN_ON(ret); ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); @@ -1190,6 +1199,7 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_merge(blk_add_trace_rq_merge, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); -- cgit v1.2.3 From 826f2f48da8c331ac51e1381998d318012d66550 Mon Sep 17 00:00:00 2001 From: Guo Xuenan Date: Sun, 28 Jun 2020 09:56:25 -0400 Subject: blk-rq-qos: remove redundant finish_wait to rq_qos_wait. It is no need do finish_wait twice after acquiring inflight. Signed-off-by: Guo Xuenan Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..18f3eab9f768 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -273,8 +273,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(&rqw->wait, &data.wq); - /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token -- cgit v1.2.3 From db9819c76c1fd48c30699381c94bba5c95dd467e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:47 +0200 Subject: block: remove bio_disassociate_blkg bio_disassociate_blkg has two callers, of which one immediately assigns a new value to >bi_blkg. Just open code the function in the two callers. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 27 ++++++++------------------- include/linux/bio.h | 2 -- 2 files changed, 8 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index fb5533416fa6..8aef4460b32e 100644 --- a/block/bio.c +++ b/block/bio.c @@ -234,8 +234,12 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_blkg(bio); - +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif if (bio_integrity(bio)) bio_integrity_free(bio); @@ -1625,21 +1629,6 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -/** - * bio_disassociate_blkg - puts back the blkg reference if associated - * @bio: target bio - * - * Helper to disassociate the blkg from @bio if a blkg is associated. - */ -void bio_disassociate_blkg(struct bio *bio) -{ - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} -EXPORT_SYMBOL_GPL(bio_disassociate_blkg); - /** * __bio_associate_blkg - associate a bio with the a blkg * @bio: target bio @@ -1656,8 +1645,8 @@ EXPORT_SYMBOL_GPL(bio_disassociate_blkg); */ static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - bio_disassociate_blkg(bio); - + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); bio->bi_blkg = blkg_tryget_closest(blkg); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 0282f8aa8593..4cd229e175c0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -497,13 +497,11 @@ static inline void bio_associate_blkg_from_page(struct bio *bio, #endif #ifdef CONFIG_BLK_CGROUP -void bio_disassociate_blkg(struct bio *bio); void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); void bio_clone_blkg_association(struct bio *dst, struct bio *src); #else /* CONFIG_BLK_CGROUP */ -static inline void bio_disassociate_blkg(struct bio *bio) { } static inline void bio_associate_blkg(struct bio *bio) { } static inline void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) -- cgit v1.2.3 From d92c370a16cbe0276954c761b874bd024a7e4fac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:48 +0200 Subject: block: really clone the block cgroup in bio_clone_blkg_association bio_clone_blkg_association is supposed to clone the associatation, but actually ends up doing a search with a tryget. As we know we have a reference on the source cgroup just get an unconditional additional reference to it and call it a day. That also removes the need for a RCU critical section. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 8aef4460b32e..e1d01acce807 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1737,12 +1737,12 @@ EXPORT_SYMBOL_GPL(bio_associate_blkg); */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - rcu_read_lock(); - - if (src->bi_blkg) - __bio_associate_blkg(dst, src->bi_blkg); - - rcu_read_unlock(); + if (src->bi_blkg) { + if (dst->bi_blkg) + blkg_put(dst->bi_blkg); + blkg_get(src->bi_blkg); + dst->bi_blkg = src->bi_blkg; + } } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ -- cgit v1.2.3 From 2badf06cf906c7af4bdd4bc1da62890c8e686341 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:49 +0200 Subject: block: merge __bio_associate_blkg into bio_associate_blkg_from_css Merge __bio_associate_blkg into the only caller, which allows to slightly reduce the RCU crticial section and better explain the code flow. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index e1d01acce807..bc8de2432e36 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1628,52 +1628,33 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP - -/** - * __bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified @blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - if (bio->bi_blkg) - blkg_put(bio->bi_blkg); - bio->bi_blkg = blkg_tryget_closest(blkg); -} - /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; + struct blkcg_gq *blkg = q->root_blkg; - rcu_read_lock(); + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); - if (!css || !css->parent) - blkg = q->root_blkg; - else + rcu_read_lock(); + if (css && css->parent) blkg = blkg_lookup_create(css_to_blkcg(css), q); - - __bio_associate_blkg(bio, blkg); - + bio->bi_blkg = blkg_tryget_closest(blkg); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); -- cgit v1.2.3 From a18b9b1590ca64f877588700de32c9ad236f405c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:50 +0200 Subject: block: move bio_associate_blkg_from_page to mm/page_io.c bio_associate_blkg_from_page is a special purpose helper for swap bios that doesn't need access to bio internals. Move it to the swap code instead of having it in bio.c. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 26 -------------------------- include/linux/bio.h | 7 ------- mm/page_io.c | 17 +++++++++++++++++ 3 files changed, 17 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index bc8de2432e36..901d22715dd4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1659,32 +1659,6 @@ void bio_associate_blkg_from_css(struct bio *bio, } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); -#ifdef CONFIG_MEMCG -/** - * bio_associate_blkg_from_page - associate a bio with the page's blkg - * @bio: target bio - * @page: the page to lookup the blkcg from - * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's - * root_blkg. - */ -void bio_associate_blkg_from_page(struct bio *bio, struct page *page) -{ - struct cgroup_subsys_state *css; - - if (!page->mem_cgroup) - return; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -#endif /* CONFIG_MEMCG */ - /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio diff --git a/include/linux/bio.h b/include/linux/bio.h index 4cd229e175c0..c6d765382926 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -489,13 +489,6 @@ do { \ #define bio_dev(bio) \ disk_devt((bio)->bi_disk) -#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -void bio_associate_blkg_from_page(struct bio *bio, struct page *page); -#else -static inline void bio_associate_blkg_from_page(struct bio *bio, - struct page *page) { } -#endif - #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, diff --git a/mm/page_io.c b/mm/page_io.c index e8726f3e3820..ccda76790088 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -277,6 +277,23 @@ static inline void count_swpout_vm_event(struct page *page) count_vm_events(PSWPOUT, hpage_nr_pages(page)); } +#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) +static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +{ + struct cgroup_subsys_state *css; + + if (!page->mem_cgroup) + return; + + rcu_read_lock(); + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + rcu_read_unlock(); +} +#else +#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ + int __swap_writepage(struct page *page, struct writeback_control *wbc, bio_end_io_t end_write_func) { -- cgit v1.2.3 From 28fc591ff9d64cbc2b780be95ee3fde8f6ade7fd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:51 +0200 Subject: block: move the bio cgroup associatation helpers to blk-cgroup.c Keep the cgroup code together. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 75 --------------------------------- block/blk-cgroup.c | 103 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/blk-cgroup.h | 30 ------------- 3 files changed, 101 insertions(+), 107 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 901d22715dd4..fc1299f9d86a 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1627,81 +1627,6 @@ int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) } EXPORT_SYMBOL(bioset_init_from_src); -#ifdef CONFIG_BLK_CGROUP -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. An association failure is handled by walking up - * the blkg tree. Therefore, the blkg associated can be anything between @blkg - * and q->root_blkg. This situation only happens when a cgroup is dying and - * then the remaining bios will spill to the closest alive blkg. - * - * A reference will be taken on the blkg and will be released when @bio is - * freed. - */ -void bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg = q->root_blkg; - - if (bio->bi_blkg) - blkg_put(bio->bi_blkg); - - rcu_read_lock(); - if (css && css->parent) - blkg = blkg_lookup_create(css_to_blkcg(css), q); - bio->bi_blkg = blkg_tryget_closest(blkg); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - -/** - * bio_associate_blkg - associate a bio with a blkg - * @bio: target bio - * - * Associate @bio with the blkg found from the bio's css and request_queue. - * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is - * already associated, the css is reused and association redone as the - * request_queue may have changed. - */ -void bio_associate_blkg(struct bio *bio) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; - else - css = blkcg_css(); - - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg); - -/** - * bio_clone_blkg_association - clone blkg association from src to dst bio - * @dst: destination bio - * @src: source bio - */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) -{ - if (src->bi_blkg) { - if (dst->bi_blkg) - blkg_put(dst->bi_blkg); - blkg_get(src->bi_blkg); - dst->bi_blkg = src->bi_blkg; - } -} -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); -#endif /* CONFIG_BLK_CGROUP */ - static void __init biovec_init_slabs(void) { int i; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ecc897b225c..bb0607bfd771 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -328,7 +328,7 @@ err_free_blkg: * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, +static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; @@ -377,7 +377,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, * This looks up or creates the blkg representing the unique pair * of the blkcg and the request_queue. */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg = blkg_lookup(blkcg, q); @@ -1727,6 +1727,105 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @blkg: blkg to get + * + * This needs to be called rcu protected. As the failure mode here is to walk + * up the blkg tree, this ensure that the blkg->parent pointers are always + * valid. This returns the blkg that it ended up taking a reference on or %NULL + * if no reference was taken. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) +{ + struct blkcg_gq *ret_blkg = NULL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + while (blkg) { + if (blkg_tryget(blkg)) { + ret_blkg = blkg; + break; + } + blkg = blkg->parent; + } + + return ret_blkg; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg = q->root_blkg; + + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); + + rcu_read_lock(); + if (css && css->parent) + blkg = blkg_lookup_create(css_to_blkcg(css), q); + bio->bi_blkg = blkg_tryget_closest(blkg); + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_blkg) { + if (dst->bi_blkg) + blkg_put(dst->bi_blkg); + blkg_get(src->bi_blkg); + dst->bi_blkg = src->bi_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index a57ebe2f00ab..60df97202314 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -183,10 +183,6 @@ extern bool blkcg_debug_stats; struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint); -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); int blkcg_init_queue(struct request_queue *q); void blkcg_exit_queue(struct request_queue *q); @@ -480,32 +476,6 @@ static inline bool blkg_tryget(struct blkcg_gq *blkg) return blkg && percpu_ref_tryget(&blkg->refcnt); } -/** - * blkg_tryget_closest - try and get a blkg ref on the closet blkg - * @blkg: blkg to get - * - * This needs to be called rcu protected. As the failure mode here is to walk - * up the blkg tree, this ensure that the blkg->parent pointers are always - * valid. This returns the blkg that it ended up taking a reference on or %NULL - * if no reference was taken. - */ -static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) -{ - struct blkcg_gq *ret_blkg = NULL; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - while (blkg) { - if (blkg_tryget(blkg)) { - ret_blkg = blkg; - break; - } - blkg = blkg->parent; - } - - return ret_blkg; -} - /** * blkg_put - put a blkg reference * @blkg: blkg to put -- cgit v1.2.3 From 8c5462875224a6d81683e733087f501f58fccd4c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:52 +0200 Subject: block: merge blkg_lookup_create and __blkg_lookup_create No good reason to keep these two functions split. Acked-by: Tejun Heo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 49 ++++++++++++++++++------------------------------- 1 file changed, 18 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index bb0607bfd771..6aedb73ffbf4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -316,30 +316,35 @@ err_free_blkg: } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and @q->queue_lock. + * should be called under RCU read lock and takes @q->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ -static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; + unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - blkg = __blkg_lookup(blkcg, q, true); + blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + goto found; + /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest @@ -362,34 +367,16 @@ static struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, } blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; + if (IS_ERR(blkg)) { + blkg = ret_blkg; + break; + } if (pos == blkcg) - return blkg; - } -} - -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. - */ -static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - - if (unlikely(!blkg)) { - unsigned long flags; - - spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup_create(blkcg, q); - spin_unlock_irqrestore(&q->queue_lock, flags); + break; } +found: + spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } -- cgit v1.2.3 From a5b97526bf2860fe8306959f19ee428d1f5cc732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:53 +0200 Subject: block: bypass blkg_tryget_closest for the root_blkg The root_blkg is only torn down at the very end of removing a queue. So in the I/O submission path is always has a life reference and we can just grab another one using blkg_get instead of doing a tryget and parent walk that won't lead anywhere. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6aedb73ffbf4..0912820d4f81 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1758,16 +1758,21 @@ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg = q->root_blkg; if (bio->bi_blkg) blkg_put(bio->bi_blkg); - rcu_read_lock(); - if (css && css->parent) + if (css && css->parent) { + struct blkcg_gq *blkg; + + rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), q); - bio->bi_blkg = blkg_tryget_closest(blkg); - rcu_read_unlock(); + bio->bi_blkg = blkg_tryget_closest(blkg); + rcu_read_unlock(); + } else { + blkg_get(q->root_blkg); + bio->bi_blkg = q->root_blkg; + } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); -- cgit v1.2.3 From 13c7863d48c160152c42c62700a722293bddeca4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:54 +0200 Subject: block: move the initial blkg lookup into blkg_tryget_closest By moving the initial blkg lookup into blkg_tryget_closest we get a nicely self contained routines that does all the RCU locking. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0912820d4f81..d21ec2acd716 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1716,19 +1716,20 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg - * @blkg: blkg to get + * @bio: target bio + * @css: target css * - * This needs to be called rcu protected. As the failure mode here is to walk - * up the blkg tree, this ensure that the blkg->parent pointers are always - * valid. This returns the blkg that it ended up taking a reference on or %NULL - * if no reference was taken. + * As the failure mode here is to walk up the blkg tree, this ensure that the + * blkg->parent pointers are always valid. This returns the blkg that it ended + * up taking a reference on or %NULL if no reference was taken. */ -static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) +static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, + struct cgroup_subsys_state *css) { - struct blkcg_gq *ret_blkg = NULL; - - WARN_ON_ONCE(!rcu_read_lock_held()); + struct blkcg_gq *blkg, *ret_blkg = NULL; + rcu_read_lock(); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1736,6 +1737,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) } blkg = blkg->parent; } + rcu_read_unlock(); return ret_blkg; } @@ -1757,21 +1759,14 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg) void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { - struct request_queue *q = bio->bi_disk->queue; - if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { - struct blkcg_gq *blkg; - - rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), q); - bio->bi_blkg = blkg_tryget_closest(blkg); - rcu_read_unlock(); + bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(q->root_blkg); - bio->bi_blkg = q->root_blkg; + blkg_get(bio->bi_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); -- cgit v1.2.3 From 93b8063804b62b55248e16499d853e1b20eff905 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:57 +0200 Subject: blk-cgroup: move rcu locking from blkcg_bio_issue_check to blk_throtl_bio The only thing in blkcg_bio_issue_check that needs to be under rcu_read_lock is blk_throtl_bio, so move the locking there. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-throttle.c | 3 ++- include/linux/blk-cgroup.h | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 209fdd8939fb..ac0083450500 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2168,7 +2168,7 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, bool throttled = false; struct throtl_data *td = tg->td; - WARN_ON_ONCE(!rcu_read_lock_held()); + rcu_read_lock(); /* see throtl_charge_bio() */ if (bio_flagged(bio, BIO_THROTTLED)) @@ -2273,6 +2273,7 @@ out: if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + rcu_read_unlock(); return throttled; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8e86b598316c..8ab043c911f2 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -546,7 +546,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, struct blkcg_gq *blkg = bio->bi_blkg; bool throtl = false; - rcu_read_lock(); throtl = blk_throtl_bio(q, blkg, bio); if (!throtl) { struct blkg_iostat_set *bis; @@ -582,7 +581,6 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q, blkcg_bio_issue_init(bio); - rcu_read_unlock(); return !throtl; } -- cgit v1.2.3 From db18a53e5ba840993a3fc908dec648402ed740bd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:58 +0200 Subject: blk-cgroup: remove blkcg_bio_issue_check blkcg_bio_issue_check is a giant inline function that does three entirely different things. Factor out the blk-cgroup related bio initalization into a new helper, and the open code the sequence in the only caller, relying on the fact that all the actual functionality is stubbed out for non-cgroup builds. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 34 ++++++++++++++++++++++++++++ block/blk-core.c | 7 +++++- block/blk-throttle.c | 5 +++-- block/blk.h | 2 ++ include/linux/blk-cgroup.h | 56 ++-------------------------------------------- 5 files changed, 47 insertions(+), 57 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index d21ec2acd716..1ce94afc03bc 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1813,6 +1813,40 @@ void bio_clone_blkg_association(struct bio *dst, struct bio *src) } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); +static int blk_cgroup_io_type(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return BLKG_IOSTAT_DISCARD; + if (op_is_write(bio->bi_opf)) + return BLKG_IOSTAT_WRITE; + return BLKG_IOSTAT_READ; +} + +void blk_cgroup_bio_start(struct bio *bio) +{ + int rwd = blk_cgroup_io_type(bio), cpu; + struct blkg_iostat_set *bis; + + cpu = get_cpu(); + bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + + /* + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split + * bio and we would have already accounted for the size of the bio. + */ + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + put_cpu(); +} + static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", diff --git a/block/blk-core.c b/block/blk-core.c index a9769c1a2875..76cfd5709f66 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1073,8 +1073,13 @@ generic_make_request_checks(struct bio *bio) if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); - if (!blkcg_bio_issue_check(q, bio)) + if (blk_throtl_bio(bio)) { + blkcg_bio_issue_init(bio); return false; + } + + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(q, bio); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index ac0083450500..9d00f62c05ec 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2158,9 +2158,10 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio) +bool blk_throtl_bio(struct bio *bio) { + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; diff --git a/block/blk.h b/block/blk.h index 3a120a070dac..41a50880c94e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -288,10 +288,12 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); extern void blk_throtl_register_queue(struct request_queue *q); +bool blk_throtl_bio(struct bio *bio); #else /* CONFIG_BLK_DEV_THROTTLING */ static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline bool blk_throtl_bio(struct bio *bio) { return false; } #endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8ab043c911f2..431b2d18bf40 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -517,14 +517,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) -#ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio); -#else -static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio) { return false; } -#endif - bool __blkcg_punt_bio_submit(struct bio *bio); static inline bool blkcg_punt_bio_submit(struct bio *bio) @@ -540,50 +532,6 @@ static inline void blkcg_bio_issue_init(struct bio *bio) bio_issue_init(&bio->bi_issue, bio_sectors(bio)); } -static inline bool blkcg_bio_issue_check(struct request_queue *q, - struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - bool throtl = false; - - throtl = blk_throtl_bio(q, blkg, bio); - if (!throtl) { - struct blkg_iostat_set *bis; - int rwd, cpu; - - if (op_is_discard(bio->bi_opf)) - rwd = BLKG_IOSTAT_DISCARD; - else if (op_is_write(bio->bi_opf)) - rwd = BLKG_IOSTAT_WRITE; - else - rwd = BLKG_IOSTAT_READ; - - cpu = get_cpu(); - bis = per_cpu_ptr(blkg->iostat_cpu, cpu); - u64_stats_update_begin(&bis->sync); - - /* - * If the bio is flagged with BIO_CGROUP_ACCT it means this is a - * split bio and we would have already accounted for the size of - * the bio. - */ - if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { - bio_set_flag(bio, BIO_CGROUP_ACCT); - bis->cur.bytes[rwd] += bio->bi_iter.bi_size; - } - bis->cur.ios[rwd]++; - - u64_stats_update_end(&bis->sync); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu); - put_cpu(); - } - - blkcg_bio_issue_init(bio); - - return !throtl; -} - static inline void blkcg_use_delay(struct blkcg_gq *blkg) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) @@ -657,6 +605,7 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg) atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); } +void blk_cgroup_bio_start(struct bio *bio); void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay); void blkcg_maybe_throttle_current(void); @@ -710,8 +659,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { } static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } static inline void blkcg_bio_issue_init(struct bio *bio) { } -static inline bool blkcg_bio_issue_check(struct request_queue *q, - struct bio *bio) { return true; } +static inline void blk_cgroup_bio_start(struct bio *bio) { } #define blk_queue_for_each_rl(rl, q) \ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) -- cgit v1.2.3 From a2e83ef9c3da0602234357bdbfe2ac64e91906e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 27 Jun 2020 09:31:59 +0200 Subject: blk-cgroup: remove a dead check in blk_throtl_bio bios must have a valid block group by the time they are submitted. Acked-by: Tejun Heo Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-throttle.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9d00f62c05ec..ad37043297ed 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2163,7 +2163,7 @@ bool blk_throtl_bio(struct bio *bio) struct request_queue *q = bio->bi_disk->queue; struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; -- cgit v1.2.3 From 36a3df5a4574d5ddf59804fcd0c4e9654c514d9a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 29 Jun 2020 17:47:59 +0800 Subject: blk-mq: put driver tag when this request is completed It is natural to release driver tag when this request is completed by LLD or device since its purpose is for LLD use. One big benefit is that the released tag can be re-used quicker since bio_endio() may take too long. Meantime we don't need to release driver tag for flush request. Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 6 ------ block/blk-mq.c | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 15ae0155ec07..21108a550fbf 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -240,7 +240,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); flush_rq->tag = -1; } else { - blk_mq_put_driver_tag(flush_rq); flush_rq->internal_tag = -1; } @@ -341,11 +340,6 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - if (q->elevator) { - WARN_ON(rq->tag < 0); - blk_mq_put_driver_tag(rq); - } - /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq.c b/block/blk-mq.c index b8738b3c6d06..d07e55455726 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -674,6 +674,8 @@ bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + blk_mq_put_driver_tag(rq); + /* * For a polled request, always complete locallly, it's pointless * to redirect the completion. -- cgit v1.2.3 From 42fdc5e49c2be97db112d410d07044e0e2c7d5bb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 29 Jun 2020 17:08:34 +0200 Subject: blk-mq: remove the BLK_MQ_REQ_INTERNAL flag Just check for a non-NULL elevator directly to make the code more clear. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 4 ++-- block/blk-mq.c | 10 +++------- block/blk-mq.h | 2 +- include/linux/blk-mq.h | 2 -- 4 files changed, 6 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ae722f8b13fb..281367b04527 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -90,9 +90,9 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { - if (!(data->flags & BLK_MQ_REQ_INTERNAL) && - !hctx_may_queue(data->hctx, bt)) + if (!data->q->elevator && !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; + if (data->shallow_depth) return __sbitmap_queue_get_shallow(bt, data->shallow_depth); else diff --git a/block/blk-mq.c b/block/blk-mq.c index d07e55455726..72d3034fe39d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -279,7 +279,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct request *rq = tags->static_rqs[tag]; req_flags_t rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { + if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { @@ -364,8 +364,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; - /* * Flush requests are special and go directly to the * dispatch list. Don't include reserved tags in the @@ -380,7 +378,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->flags & BLK_MQ_REQ_INTERNAL)) + if (!e) blk_mq_tag_busy(data->hctx); /* @@ -476,9 +474,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); data.ctx = __blk_mq_get_ctx(q, cpu); - if (q->elevator) - data.flags |= BLK_MQ_REQ_INTERNAL; - else + if (!q->elevator) blk_mq_tag_busy(data.hctx); ret = -EWOULDBLOCK; diff --git a/block/blk-mq.h b/block/blk-mq.h index b3ce0f3a2ad2..c6330335767c 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -159,7 +159,7 @@ struct blk_mq_alloc_data { static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->flags & BLK_MQ_REQ_INTERNAL) + if (data->q->elevator) return data->hctx->sched_tags; return data->hctx->tags; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 1641ec6cd7e5..8986e88a986b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -447,8 +447,6 @@ enum { BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), - /* allocate internal/sched tag */ - BLK_MQ_REQ_INTERNAL = (__force blk_mq_req_flags_t)(1 << 2), /* set RQF_PREEMPT */ BLK_MQ_REQ_PREEMPT = (__force blk_mq_req_flags_t)(1 << 3), }; -- cgit v1.2.3 From 65c763694398a2de63803b264dcf906b47f9d4c1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:24:56 +0800 Subject: blk-mq: pass request queue into get/put budget callback blk-mq budget is abstract from scsi's device queue depth, and it is always per-request-queue instead of hctx. It can be quite absurd to get a budget from one hctx, then dequeue a request from scheduler queue, and this request may not belong to this hctx, at least for bfq and deadline. So fix the mess and always pass request queue to get/put budget callback. Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Douglas Anderson Reviewed-by: Sagi Grimberg Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Cc: Douglas Anderson Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 8 ++++---- block/blk-mq.c | 8 ++++---- block/blk-mq.h | 12 ++++-------- drivers/scsi/scsi_lib.c | 8 +++----- include/linux/blk-mq.h | 4 ++-- 5 files changed, 17 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index fdcc2c1dd178..a31e281e9d31 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -108,12 +108,12 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) break; } - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -173,12 +173,12 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the diff --git a/block/blk-mq.c b/block/blk-mq.c index 72d3034fe39d..8cdc868f4249 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1284,7 +1284,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); hctx = rq->mq_hctx; - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { + if (!got_budget && !blk_mq_get_dispatch_budget(q)) { blk_mq_put_driver_tag(rq); no_budget_avail = true; break; @@ -1299,7 +1299,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); /* * For non-shared tags, the RESTART check * will suffice. @@ -1947,11 +1947,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (q->elevator && !bypass_insert) goto insert; - if (!blk_mq_get_dispatch_budget(hctx)) + if (!blk_mq_get_dispatch_budget(q)) goto insert; if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q); goto insert; } diff --git a/block/blk-mq.h b/block/blk-mq.h index c6330335767c..e4af193d39ac 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -179,20 +179,16 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); -static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->put_budget) - q->mq_ops->put_budget(hctx); + q->mq_ops->put_budget(q); } -static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->get_budget) - return q->mq_ops->get_budget(hctx); + return q->mq_ops->get_budget(q); return true; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6ca91d09eca1..534b85e87c80 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1597,17 +1597,15 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) +static void scsi_mq_put_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; atomic_dec(&sdev->device_busy); } -static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; return scsi_dev_queue_ready(q, sdev); @@ -1674,7 +1672,7 @@ out_dec_target_busy: if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: - scsi_mq_put_budget(hctx); + scsi_mq_put_budget(q); switch (ret) { case BLK_STS_OK: break; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 8986e88a986b..faa340b70123 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -270,8 +270,8 @@ struct blk_mq_queue_data { typedef blk_status_t (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); typedef void (commit_rqs_fn)(struct blk_mq_hw_ctx *); -typedef bool (get_budget_fn)(struct blk_mq_hw_ctx *); -typedef void (put_budget_fn)(struct blk_mq_hw_ctx *); +typedef bool (get_budget_fn)(struct request_queue *); +typedef void (put_budget_fn)(struct request_queue *); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); -- cgit v1.2.3 From 445874e89f662c8c6a56e6bb820cbe9fbef61428 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:24:57 +0800 Subject: blk-mq: pass hctx to blk_mq_dispatch_rq_list All requests in the 'list' of blk_mq_dispatch_rq_list belong to same hctx, so it is better to pass hctx instead of request queue, because blk-mq's dispatch target is hctx instead of request queue. Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 14 ++++++-------- block/blk-mq.c | 6 +++--- block/blk-mq.h | 2 +- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a31e281e9d31..632c6f8b63f7 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -96,10 +96,9 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) struct elevator_queue *e = q->elevator; LIST_HEAD(rq_list); int ret = 0; + struct request *rq; do { - struct request *rq; - if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; @@ -131,7 +130,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, true)); return ret; } @@ -161,10 +160,9 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; + struct request *rq; do { - struct request *rq; - if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; break; @@ -200,7 +198,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, true)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -240,7 +238,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { + if (blk_mq_dispatch_rq_list(hctx, &rq_list, false)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); else @@ -253,7 +251,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) ret = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list, false); + blk_mq_dispatch_rq_list(hctx, &rq_list, false); } return ret; diff --git a/block/blk-mq.c b/block/blk-mq.c index 8cdc868f4249..492aeda44da6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1258,10 +1258,10 @@ static void blk_mq_handle_zone_resource(struct request *rq, /* * Returns true if we did some work AND can potentially do more. */ -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool got_budget) { - struct blk_mq_hw_ctx *hctx; + struct request_queue *q = hctx->queue; struct request *rq, *nxt; bool no_tag = false; int errors, queued; @@ -1283,7 +1283,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); - hctx = rq->mq_hctx; + WARN_ON_ONCE(hctx != rq->mq_hctx); if (!got_budget && !blk_mq_get_dispatch_budget(q)) { blk_mq_put_driver_tag(rq); no_budget_avail = true; diff --git a/block/blk-mq.h b/block/blk-mq.h index e4af193d39ac..3fd1ed78e5e1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -40,7 +40,7 @@ struct blk_mq_ctx { void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -- cgit v1.2.3 From 7538352453189abf3a96461e187b1085b0ae0c6c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:24:58 +0800 Subject: blk-mq: move getting driver tag and budget into one helper Move code for getting driver tag and budget into one helper, so blk_mq_dispatch_rq_list gets a bit simplified, and easier to read. Meantime move updating of 'no_tag' and 'no_budget_available' into the branch for handling partial dispatch because that is exactly consumer of the two local variables. Also rename the parameter of 'got_budget' as 'ask_budget'. No functional change. Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 66 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 26 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 492aeda44da6..3e370f5bc40a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1255,18 +1255,50 @@ static void blk_mq_handle_zone_resource(struct request *rq, __blk_mq_requeue_request(rq); } +enum prep_dispatch { + PREP_DISPATCH_OK, + PREP_DISPATCH_NO_TAG, + PREP_DISPATCH_NO_BUDGET, +}; + +static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, + bool need_budget) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + + if (!blk_mq_get_driver_tag(rq)) { + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. + */ + if (!blk_mq_mark_tag_wait(hctx, rq)) { + blk_mq_put_dispatch_budget(rq->q); + return PREP_DISPATCH_NO_TAG; + } + } + + return PREP_DISPATCH_OK; +} + /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool got_budget) { + enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq, *nxt; - bool no_tag = false; int errors, queued; blk_status_t ret = BLK_STS_OK; - bool no_budget_avail = false; LIST_HEAD(zone_list); if (list_empty(list)) @@ -1284,31 +1316,9 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - if (!got_budget && !blk_mq_get_dispatch_budget(q)) { - blk_mq_put_driver_tag(rq); - no_budget_avail = true; + prep = blk_mq_prep_dispatch_rq(rq, !got_budget); + if (prep != PREP_DISPATCH_OK) break; - } - - if (!blk_mq_get_driver_tag(rq)) { - /* - * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. The - * waitqueue takes care of that. If the queue is run - * before we add this entry back on the dispatch list, - * we'll re-run it below. - */ - if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(q); - /* - * For non-shared tags, the RESTART check - * will suffice. - */ - if (hctx->flags & BLK_MQ_F_TAG_SHARED) - no_tag = true; - break; - } - } list_del_init(&rq->queuelist); @@ -1361,6 +1371,10 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, */ if (!list_empty(list)) { bool needs_restart; + /* For non-shared tags, the RESTART check will suffice */ + bool no_tag = prep == PREP_DISPATCH_NO_TAG && + (hctx->flags & BLK_MQ_F_TAG_SHARED); + bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; /* * If we didn't flush the entire list, we could have told -- cgit v1.2.3 From bbdb3c5d94d079ab4c6eaf9ac0d4f6813f1c0327 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:24:59 +0800 Subject: blk-mq: remove dead check from blk_mq_dispatch_rq_list When BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE is returned from .queue_rq, the 'list' variable always holds this rq which isn't queued to LLD successfully. So blk_mq_dispatch_rq_list() always returns false from the branch of '!list_empty(list)'. No functional change. Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Johannes Thumshirn Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 3e370f5bc40a..a80cb6ecc9d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1426,13 +1426,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, } else blk_mq_update_dispatch_busy(hctx, false); - /* - * If the host/device is unable to accept more work, inform the - * caller of that. - */ - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - return false; - return (queued + errors) != 0; } -- cgit v1.2.3 From 1fd40b5ea72cf830cfb932bbf791533931c615e9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:25:00 +0800 Subject: blk-mq: pass obtained budget count to blk_mq_dispatch_rq_list Pass obtained budget count to blk_mq_dispatch_rq_list(), and prepare for supporting fully batching submission. With the obtained budget count, it is easier to put extra budgets in case of .queue_rq failure. Meantime remove the old 'got_budget' parameter. Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Christoph Hellwig Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 8 ++++---- block/blk-mq.c | 31 +++++++++++++++++++++++++++---- block/blk-mq.h | 3 ++- 3 files changed, 33 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 632c6f8b63f7..4c72073830f3 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -130,7 +130,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * in blk_mq_dispatch_rq_list(). */ list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); return ret; } @@ -198,7 +198,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -238,7 +238,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if (!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(hctx, &rq_list, false)) { + if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { if (has_sched_dispatch) ret = blk_mq_do_dispatch_sched(hctx); else @@ -251,7 +251,7 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) ret = blk_mq_do_dispatch_ctx(hctx); } else { blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(hctx, &rq_list, false); + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); } return ret; diff --git a/block/blk-mq.c b/block/blk-mq.c index a80cb6ecc9d9..1b701e166681 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1280,7 +1280,12 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(rq->q); + /* + * All budgets not got from this function will be put + * together during handling partial dispatch + */ + if (need_budget) + blk_mq_put_dispatch_budget(rq->q); return PREP_DISPATCH_NO_TAG; } } @@ -1288,11 +1293,21 @@ static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, return PREP_DISPATCH_OK; } +/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ +static void blk_mq_release_budgets(struct request_queue *q, + unsigned int nr_budgets) +{ + int i; + + for (i = 0; i < nr_budgets; i++) + blk_mq_put_dispatch_budget(q); +} + /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, - bool got_budget) + unsigned int nr_budgets) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; @@ -1304,7 +1319,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (list_empty(list)) return false; - WARN_ON(!list_is_singular(list) && got_budget); + WARN_ON(!list_is_singular(list) && nr_budgets); /* * Now process all the entries, sending them to the driver. @@ -1316,7 +1331,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); - prep = blk_mq_prep_dispatch_rq(rq, !got_budget); + prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); if (prep != PREP_DISPATCH_OK) break; @@ -1335,6 +1350,12 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, bd.last = !blk_mq_get_driver_tag(nxt); } + /* + * once the request is queued to lld, no need to cover the + * budget any more + */ + if (nr_budgets) + nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { blk_mq_handle_dev_resource(rq, list); @@ -1376,6 +1397,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, (hctx->flags & BLK_MQ_F_TAG_SHARED); bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; + blk_mq_release_budgets(q, nr_budgets); + /* * If we didn't flush the entire list, we could have told * the driver there was more coming, but that turned out to diff --git a/block/blk-mq.h b/block/blk-mq.h index 3fd1ed78e5e1..863a2f3346d4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -40,7 +40,8 @@ struct blk_mq_ctx { void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, bool); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, + unsigned int); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, bool kick_requeue_list); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); -- cgit v1.2.3 From 6e6fcbc27e7788af54139c53537395d95560f2ef Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 18:25:01 +0800 Subject: blk-mq: support batching dispatch in case of io More and more drivers want to get batching requests queued from block layer, such as mmc, and tcp based storage drivers. Also current in-tree users have virtio-scsi, virtio-blk and nvme. For none, we already support batching dispatch. But for io scheduler, every time we just take one request from scheduler and pass the single request to blk_mq_dispatch_rq_list(). This way makes batching dispatch not possible when io scheduler is applied. One reason is that we don't want to hurt sequential IO performance, becasue IO merge chance is reduced if more requests are dequeued from scheduler queue. Try to support batching dispatch for io scheduler by starting with the following simple approach: 1) still make sure we can get budget before dequeueing request 2) use hctx->dispatch_busy to evaluate if queue is busy, if it is busy we fackback to non-batching dispatch, otherwise dequeue as many as possible requests from scheduler, and pass them to blk_mq_dispatch_rq_list(). Wrt. 2), we use similar policy for none, and turns out that SCSI SSD performance got improved much. In future, maybe we can develop more intelligent algorithem for batching dispatch. Baolin has tested this patch and found that MMC performance is improved[3]. [1] https://lore.kernel.org/linux-block/20200512075501.GF1531898@T590/#r [2] https://lore.kernel.org/linux-block/fe6bd8b9-6ed9-b225-f80c-314746133722@grimberg.me/ [3] https://lore.kernel.org/linux-block/CADBw62o9eTQDJ9RvNgEqSpXmg6Xcq=2TxH0Hfxhp29uF2W=TXA@mail.gmail.com/ Signed-off-by: Ming Lei Tested-by: Baolin Wang Reviewed-by: Christoph Hellwig Cc: Sagi Grimberg Cc: Baolin Wang Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++----- block/blk-mq.c | 2 -- 2 files changed, 82 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 4c72073830f3..1c52e56a19b1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -80,6 +81,37 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) blk_mq_run_hw_queue(hctx, true); } +static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return rqa->mq_hctx > rqb->mq_hctx; +} + +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) +{ + struct blk_mq_hw_ctx *hctx = + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; + struct request *rq; + LIST_HEAD(hctx_list); + unsigned int count = 0; + bool ret; + + list_for_each_entry(rq, rq_list, queuelist) { + if (rq->mq_hctx != hctx) { + list_cut_before(&hctx_list, rq_list, &rq->queuelist); + goto dispatch; + } + count++; + } + list_splice_tail_init(rq_list, &hctx_list); + +dispatch: + ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count); + return ret; +} + #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ /* @@ -90,20 +122,29 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ -static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; + bool multi_hctxs = false, run_queue = false; + bool dispatched = false, busy = false; + unsigned int max_dispatch; LIST_HEAD(rq_list); - int ret = 0; - struct request *rq; + int count = 0; + + if (hctx->dispatch_busy) + max_dispatch = 1; + else + max_dispatch = hctx->queue->nr_requests; do { + struct request *rq; + if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { - ret = -EAGAIN; + busy = true; break; } @@ -120,7 +161,7 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * no guarantee anyone will kick the queue. Kick it * ourselves. */ - blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + run_queue = true; break; } @@ -129,8 +170,42 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). */ - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + list_add_tail(&rq->queuelist, &rq_list); + if (rq->mq_hctx != hctx) + multi_hctxs = true; + } while (++count < max_dispatch); + + if (!count) { + if (run_queue) + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + } else if (multi_hctxs) { + /* + * Requests from different hctx may be dequeued from some + * schedulers, such as bfq and deadline. + * + * Sort the requests in the list according to their hctx, + * dispatch batching requests from same hctx at a time. + */ + list_sort(NULL, &rq_list, sched_rq_cmp); + do { + dispatched |= blk_mq_dispatch_hctx_list(&rq_list); + } while (!list_empty(&rq_list)); + } else { + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + } + + if (busy) + return -EAGAIN; + return !!dispatched; +} + +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + int ret; + + do { + ret = __blk_mq_do_dispatch_sched(hctx); + } while (ret == 1); return ret; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 1b701e166681..a8fac4ff6fef 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1319,8 +1319,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (list_empty(list)) return false; - WARN_ON(!list_is_singular(list) && nr_budgets); - /* * Now process all the entries, sending them to the driver. */ -- cgit v1.2.3 From 570e9b73b0af2e5381ca5343759779b8c1ed20e3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 22:03:55 +0800 Subject: blk-mq: move blk_mq_get_driver_tag into blk-mq.c blk_mq_get_driver_tag() is only used by blk-mq.c and is supposed to stay in blk-mq.c, so move it and preparing for cleanup code of get/put driver tag. Meantime hctx_may_queue() is moved to header file and it is fine since it is defined as inline always. No functional change. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq-tag.c | 58 ------------------------------------------------------ block/blk-mq-tag.h | 39 ++++++++++++++++++++++++++++-------- block/blk-mq.c | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 66 deletions(-) (limited to 'block') diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 281367b04527..32d82e23b095 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -56,37 +56,6 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) blk_mq_tag_wakeup_all(tags, false); } -/* - * For shared tag users, we track the number of currently active users - * and attempt to provide a fair share of the tag depth for each of them. - */ -static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct sbitmap_queue *bt) -{ - unsigned int depth, users; - - if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return true; - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return true; - - /* - * Don't try dividing an ant - */ - if (bt->sb.depth == 1) - return true; - - users = atomic_read(&hctx->tags->active_queues); - if (!users) - return true; - - /* - * Allow at least some tags - */ - depth = max((bt->sb.depth + users - 1) / users, 4U); - return atomic_read(&hctx->nr_active) < depth; -} - static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { @@ -191,33 +160,6 @@ found_tag: return tag + tag_offset; } -bool __blk_mq_get_driver_tag(struct request *rq) -{ - struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; - unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); - int tag; - - if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = &rq->mq_hctx->tags->breserved_tags; - tag_offset = 0; - } - - if (!hctx_may_queue(rq->mq_hctx, bt)) - return false; - tag = __sbitmap_queue_get(bt); - if (tag == BLK_MQ_NO_TAG) - return false; - - rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; - return true; -} - void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 2e4ef51cdb32..3945c7f5b944 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -51,14 +51,6 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; -bool __blk_mq_get_driver_tag(struct request *rq); -static inline bool blk_mq_get_driver_tag(struct request *rq) -{ - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); -} - extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); @@ -78,6 +70,37 @@ static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) __blk_mq_tag_idle(hctx); } +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct sbitmap_queue *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) + return true; + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->sb.depth == 1) + return true; + + users = atomic_read(&hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->sb.depth + users - 1) / users, 4U); + return atomic_read(&hctx->nr_active) < depth; +} + /* * This helper should only be used for flush request to share tag * with the request cloned from, and both the two requests can't be diff --git a/block/blk-mq.c b/block/blk-mq.c index a8fac4ff6fef..f386ddfdcf20 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1103,6 +1103,40 @@ static inline unsigned int queued_to_index(unsigned int queued) return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } +static bool __blk_mq_get_driver_tag(struct request *rq) +{ + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; + unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + bool shared = blk_mq_tag_busy(rq->mq_hctx); + int tag; + + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { + bt = &rq->mq_hctx->tags->breserved_tags; + tag_offset = 0; + } + + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; + tag = __sbitmap_queue_get(bt); + if (tag == BLK_MQ_NO_TAG) + return false; + + rq->tag = tag + tag_offset; + if (shared) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&rq->mq_hctx->nr_active); + } + rq->mq_hctx->tags->rqs[rq->tag] = rq; + return true; +} + +static bool blk_mq_get_driver_tag(struct request *rq) +{ + if (rq->tag != BLK_MQ_NO_TAG) + return true; + return __blk_mq_get_driver_tag(rq); +} + static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { -- cgit v1.2.3 From 723bf178f158abd1ce6069cb049581b3cb003aab Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 22:03:56 +0800 Subject: blk-mq: move blk_mq_put_driver_tag() into blk-mq.c It is used by blk-mq.c only, so move it to the source file. Suggested-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 20 ++++++++++++++++++++ block/blk-mq.h | 20 -------------------- 2 files changed, 20 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index f386ddfdcf20..803d4ee3b8de 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -666,6 +666,26 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } +static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = BLK_MQ_NO_TAG; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) + return; + + __blk_mq_put_driver_tag(rq->mq_hctx, rq); +} + bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); diff --git a/block/blk-mq.h b/block/blk-mq.h index 863a2f3346d4..3661e821d460 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -193,26 +193,6 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) return true; } -static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = BLK_MQ_NO_TAG; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static inline void blk_mq_put_driver_tag(struct request *rq) -{ - if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) - return; - - __blk_mq_put_driver_tag(rq->mq_hctx, rq); -} - static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; -- cgit v1.2.3 From 37f4a24c2469a10a4c16c641671bd766e276cf9f Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 30 Jun 2020 22:03:57 +0800 Subject: blk-mq: centralise related handling into blk_mq_get_driver_tag Move .nr_active update and request assignment into blk_mq_get_driver_tag(), all are good to do during getting driver tag. Meantime blk-flush related code is simplified and flush request needn't to update the request table manually any more. Signed-off-by: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 17 ++++++----------- block/blk-mq-tag.h | 12 ------------ block/blk-mq.c | 30 ++++++++++++++---------------- block/blk.h | 5 ----- 4 files changed, 20 insertions(+), 44 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 21108a550fbf..e756db088d84 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -236,12 +236,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) error = fq->rq_status; hctx = flush_rq->mq_hctx; - if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; - } else { - flush_rq->internal_tag = -1; - } + if (!q->elevator) + flush_rq->tag = BLK_MQ_NO_TAG; + else + flush_rq->internal_tag = BLK_MQ_NO_TAG; running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); @@ -315,13 +313,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { - fq->orig_rq = first_rq; + if (!q->elevator) flush_rq->tag = first_rq->tag; - blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); - } else { + else flush_rq->internal_tag = first_rq->internal_tag; - } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 3945c7f5b944..b1acac518c4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -/* - * This helper should only be used for flush request to share tag - * with the request cloned from, and both the two requests can't be - * in flight at the same time. The caller has to make sure the tag - * can't be freed. - */ -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag, struct request *rq) -{ - hctx->tags->rqs[tag] = rq; -} - static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 803d4ee3b8de..65e0846fd065 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; - data->hctx->tags->rqs[rq->tag] = rq; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; - rq->rq_flags = rq_flags; + rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -1127,9 +1121,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); int tag; + blk_mq_tag_busy(rq->mq_hctx); + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; @@ -1142,19 +1137,22 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; return true; } static bool blk_mq_get_driver_tag(struct request *rq) { - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if (hctx->flags & BLK_MQ_F_TAG_SHARED) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + hctx->tags->rqs[rq->tag] = rq; + return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, diff --git a/block/blk.h b/block/blk.h index 41a50880c94e..0184a31fe4df 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,11 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - /* - * flush_rq shares tag with this rq, both can't be active - * at the same time - */ - struct request *orig_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; -- cgit v1.2.3 From 0b8cc25d942b50abb0c7ba597bc1621b8929ab75 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 30 Jun 2020 16:54:41 +0100 Subject: blk-cgroup: clean up indentation There is a statement that is indented one level too deeply, fix it by removing a tab. Signed-off-by: Colin Ian King Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1ce94afc03bc..748c4b2a9273 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1837,7 +1837,7 @@ void blk_cgroup_bio_start(struct bio *bio) */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); - bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; -- cgit v1.2.3 From b5fc1e8bedf8ad2c6381e0df6331ad5686aca425 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 27 Apr 2020 21:12:50 +0800 Subject: blk-mq: remove pointless call of list_entry_rq() in hctx_show_busy_rq() Just use rq directly, the usage of list_entry_rq() doesn't make any sense. Signed-off-by: Hou Tao Reviewed-by: Ming Lei Reviewed-by: Bart Van Assche Signed-off-by: Jens Axboe --- block/blk-mq-debugfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index a2800bc56fb4..439b7c217638 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -401,8 +401,7 @@ static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) - __blk_mq_debugfs_rq_show(params->m, - list_entry_rq(&rq->queuelist)); + __blk_mq_debugfs_rq_show(params->m, rq); return true; } -- cgit v1.2.3 From f695ca3886ce72b027af7aa6040cd420cae2088c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:39 +0200 Subject: block: remove the request_queue argument from blk_queue_split The queue can be trivially derived from the bio, so pass one less argument. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 21 ++++++++++----------- block/blk-mq.c | 2 +- block/blk.h | 3 +-- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/ps3vram.c | 2 +- drivers/block/rsxx/dev.c | 2 +- drivers/block/umem.c | 2 +- drivers/lightnvm/pblk-init.c | 4 ++-- drivers/md/dm.c | 2 +- drivers/md/md.c | 2 +- drivers/nvme/host/multipath.c | 9 ++++----- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/block/xpram.c | 2 +- include/linux/blkdev.h | 2 +- 15 files changed, 28 insertions(+), 31 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 9c9fb21584b6..20fa22906041 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -283,20 +283,20 @@ split: /** * __blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer * @bio: [in, out] bio to be split * @nr_segs: [out] number of segments in the first bio * * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @q->bio_split, it is the responsibility - * of the caller to ensure that @q is only released after processing of the + * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is + * the responsibility of the caller to ensure that + * @bio->bi_disk->queue->bio_split is only released after processing of the * split bio has finished. */ -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs) +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { + struct request_queue *q = (*bio)->bi_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -345,20 +345,19 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio, /** * blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer * @bio: [in, out] bio to be split * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @q->bio_split, it is the responsibility of the caller to - * ensure that @q is only released after processing of the split bio has - * finished. + * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of + * the caller to ensure that @bio->bi_disk->queue->bio_split is only released + * after processing of the split bio has finished. */ -void blk_queue_split(struct request_queue *q, struct bio **bio) +void blk_queue_split(struct bio **bio) { unsigned int nr_segs; - __blk_queue_split(q, bio, &nr_segs); + __blk_queue_split(bio, &nr_segs); } EXPORT_SYMBOL(blk_queue_split); diff --git a/block/blk-mq.c b/block/blk-mq.c index 65e0846fd065..dbadb7defd61 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2166,7 +2166,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) blk_status_t ret; blk_queue_bounce(q, &bio); - __blk_queue_split(q, &bio, &nr_segs); + __blk_queue_split(&bio, &nr_segs); if (!bio_integrity_prep(bio)) goto queue_exit; diff --git a/block/blk.h b/block/blk.h index 0184a31fe4df..0114fd92c8a0 100644 --- a/block/blk.h +++ b/block/blk.h @@ -220,8 +220,7 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs); +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); int ll_front_merge_fn(struct request *req, struct bio *bio, diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3f09b2ab9778..936868047422 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1598,7 +1598,7 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) struct drbd_device *device = bio->bi_disk->private_data; unsigned long start_jif; - blk_queue_split(q, &bio); + blk_queue_split(&bio); start_jif = jiffies; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 27a33adc41e4..29b0c62dc86c 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2434,7 +2434,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) char b[BDEVNAME_SIZE]; struct bio *split; - blk_queue_split(q, &bio); + blk_queue_split(&bio); pd = q->queuedata; if (!pd) { diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 5a1d1d137c72..76cc584aa763 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -593,7 +593,7 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) dev_dbg(&dev->core, "%s\n", __func__); - blk_queue_split(q, &bio); + blk_queue_split(&bio); spin_lock_irq(&priv->lock); busy = !bio_list_empty(&priv->list); diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 6a4d8d26e32c..1d52bc73dd0f 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -123,7 +123,7 @@ static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) struct rsxx_bio_meta *bio_meta; blk_status_t st = BLK_STS_IOERR; - blk_queue_split(q, &bio); + blk_queue_split(&bio); might_sleep(); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 5498f1cf36b3..3b89c07f9e9d 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -527,7 +527,7 @@ static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, bio->bi_iter.bi_size); - blk_queue_split(q, &bio); + blk_queue_split(&bio); spin_lock_irq(&card->lock); *card->biotail = bio; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 6e677ff62cc9..7a4a1b7a941d 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -63,7 +63,7 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) * constraint. Writes can be of arbitrary size. */ if (bio_data_dir(bio) == READ) { - blk_queue_split(q, &bio); + blk_queue_split(&bio); pblk_submit_read(pblk, bio); } else { /* Prevent deadlock in the case of a modest LUN configuration @@ -71,7 +71,7 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) * leaves at least 256KB available for user I/O. */ if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) - blk_queue_split(q, &bio); + blk_queue_split(&bio); pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c8d91f271c27..5aa7a604f4cb 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1776,7 +1776,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, */ if (current->bio_list) { if (is_abnormal_io(bio)) - blk_queue_split(md->queue, &bio); + blk_queue_split(&bio); else dm_queue_split(md, ti, &bio); } diff --git a/drivers/md/md.c b/drivers/md/md.c index f567f536b529..ff20868e5e1b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -475,7 +475,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } - blk_queue_split(q, &bio); + blk_queue_split(&bio); if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index da78e499947a..5a5205ea570a 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -301,12 +301,11 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, int srcu_idx; /* - * The namespace might be going away and the bio might - * be moved to a different queue via blk_steal_bios(), - * so we need to use the bio_split pool from the original - * queue to allocate the bvecs from. + * The namespace might be going away and the bio might be moved to a + * different queue via blk_steal_bios(), so we need to use the bio_split + * pool from the original queue to allocate the bvecs from. */ - blk_queue_split(q, &bio); + blk_queue_split(&bio); srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 384edffe5cb4..dfe21eb72760 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -878,7 +878,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) unsigned long source_addr; unsigned long bytes_done; - blk_queue_split(q, &bio); + blk_queue_split(&bio); bytes_done = 0; dev_info = bio->bi_disk->private_data; diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 45a04daec89e..5456f0ad5a40 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -191,7 +191,7 @@ static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) unsigned long page_addr; unsigned long bytes; - blk_queue_split(q, &bio); + blk_queue_split(&bio); if ((bio->bi_iter.bi_sector & 7) != 0 || (bio->bi_iter.bi_size & 4095) != 0) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 15497782c176..d002defc1789 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -871,7 +871,7 @@ extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); -extern void blk_queue_split(struct request_queue *, struct bio **); +extern void blk_queue_split(struct bio **); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, unsigned int, void __user *); -- cgit v1.2.3 From c8178674608679c8146235abb0be4e2c3a86f409 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:40 +0200 Subject: block: tidy up a warning in bio_check_ro The "generic_make_request: " prefix has no value, and will soon become stale. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 76cfd5709f66..95dca74534ff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -869,8 +869,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) return false; WARN_ONCE(1, - "generic_make_request: Trying to write " - "to read-only block-device %s (partno %d)\n", + "Trying to write to read-only block-device %s (partno %d)\n", bio_devname(bio, b), part->partno); /* Older lvm-tools actually trigger this */ return false; -- cgit v1.2.3 From 833f84e2b9b512c02a8d751e63e2b7de4ea2aa82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:41 +0200 Subject: block: remove the NULL queue check in generic_make_request_checks All registers disks must have a valid queue pointer, so don't bother to log a warning for that case. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 95dca74534ff..37435d0d4335 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -973,22 +973,12 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { - struct request_queue *q; + struct request_queue *q = bio->bi_disk->queue; int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; - char b[BDEVNAME_SIZE]; might_sleep(); - q = bio->bi_disk->queue; - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); - goto end_io; - } - /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. -- cgit v1.2.3 From e439ab710fb0564dccb52b0519b3d354ea2a9c50 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:42 +0200 Subject: block: remove the nr_sectors variable in generic_make_request_checks The variable is only used once, so just open code the bio_sector() there. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 37435d0d4335..28f60985dc75 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -974,7 +974,6 @@ static noinline_for_stack bool generic_make_request_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; - int nr_sectors = bio_sectors(bio); blk_status_t status = BLK_STS_IOERR; might_sleep(); @@ -1007,7 +1006,7 @@ generic_make_request_checks(struct bio *bio) if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!nr_sectors) { + if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } -- cgit v1.2.3 From c62b37d96b6eb3ec5ae4cbe00db107bf15aebc93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:43 +0200 Subject: block: move ->make_request_fn to struct block_device_operations The make_request_fn is a little weird in that it sits directly in struct request_queue instead of an operation vector. Replace it with a block_device_operations method called submit_bio (which describes much better what it does). Also remove the request_queue argument to it, as the queue can be derived pretty trivially from the bio. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.rst | 2 +- Documentation/block/writeback_cache_control.rst | 2 +- arch/m68k/emu/nfblock.c | 5 ++- arch/xtensa/platforms/iss/simdisk.c | 5 ++- block/blk-cgroup.c | 2 +- block/blk-core.c | 53 +++++++++---------------- block/blk-mq.c | 10 ++--- block/blk.h | 2 - drivers/block/brd.c | 5 ++- drivers/block/drbd/drbd_int.h | 2 +- drivers/block/drbd/drbd_main.c | 9 +++-- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/null_blk_main.c | 17 ++++++-- drivers/block/pktcdvd.c | 11 ++--- drivers/block/ps3vram.c | 15 ++++--- drivers/block/rsxx/dev.c | 7 +++- drivers/block/umem.c | 5 ++- drivers/block/zram/zram_drv.c | 11 ++--- drivers/lightnvm/core.c | 8 +--- drivers/lightnvm/pblk-init.c | 12 ++++-- drivers/md/bcache/request.c | 4 +- drivers/md/bcache/request.h | 4 +- drivers/md/bcache/super.c | 23 +++++++---- drivers/md/dm.c | 23 ++++++----- drivers/md/md.c | 5 ++- drivers/nvdimm/blk.c | 5 ++- drivers/nvdimm/btt.c | 5 ++- drivers/nvdimm/pmem.c | 5 ++- drivers/nvme/host/core.c | 1 + drivers/nvme/host/multipath.c | 5 +-- drivers/nvme/host/nvme.h | 1 + drivers/s390/block/dcssblk.c | 9 ++--- drivers/s390/block/xpram.c | 6 +-- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 7 +--- include/linux/lightnvm.h | 3 +- 36 files changed, 153 insertions(+), 140 deletions(-) (limited to 'block') diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index b964796ec9c7..267384159bf7 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -1036,7 +1036,7 @@ Now the generic block layer performs partition-remapping early and thus provides drivers with a sector number relative to whole device, rather than having to take partition number into account in order to arrive at the true sector number. The routine blk_partition_remap() is invoked by -generic_make_request even before invoking the queue specific make_request_fn, +generic_make_request even before invoking the queue specific ->submit_bio, so the i/o scheduler also gets to operate on whole disk sector numbers. This should typically not require changes to block drivers, it just never gets to invoke its own partition sector offset calculations since all bios diff --git a/Documentation/block/writeback_cache_control.rst b/Documentation/block/writeback_cache_control.rst index 2c752c57c14c..b208488d0aae 100644 --- a/Documentation/block/writeback_cache_control.rst +++ b/Documentation/block/writeback_cache_control.rst @@ -47,7 +47,7 @@ the Forced Unit Access is implemented. The REQ_PREFLUSH and REQ_FUA flags may both be set on a single bio. -Implementation details for make_request_fn based block drivers +Implementation details for bio based block drivers -------------------------------------------------------------- These drivers will always see the REQ_PREFLUSH and REQ_FUA bits as they sit diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 87e8b1700acd..92d26c812441 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -59,7 +59,7 @@ struct nfhd_device { struct gendisk *disk; }; -static blk_qc_t nfhd_make_request(struct request_queue *queue, struct bio *bio) +static blk_qc_t nfhd_submit_bio(struct bio *bio) { struct nfhd_device *dev = bio->bi_disk->private_data; struct bio_vec bvec; @@ -93,6 +93,7 @@ static int nfhd_getgeo(struct block_device *bdev, struct hd_geometry *geo) static const struct block_device_operations nfhd_ops = { .owner = THIS_MODULE, + .submit_bio = nfhd_submit_bio, .getgeo = nfhd_getgeo, }; @@ -118,7 +119,7 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->queue = blk_alloc_queue(nfhd_make_request, NUMA_NO_NODE); + dev->queue = blk_alloc_queue(NUMA_NO_NODE); if (dev->queue == NULL) goto free_dev; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 31b5020077a0..5107140dbb7e 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -101,7 +101,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, spin_unlock(&dev->lock); } -static blk_qc_t simdisk_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t simdisk_submit_bio(struct bio *bio) { struct simdisk *dev = bio->bi_disk->private_data; struct bio_vec bvec; @@ -144,6 +144,7 @@ static void simdisk_release(struct gendisk *disk, fmode_t mode) static const struct block_device_operations simdisk_ops = { .owner = THIS_MODULE, + .submit_bio = simdisk_submit_bio, .open = simdisk_open, .release = simdisk_release, }; @@ -267,7 +268,7 @@ static int __init simdisk_setup(struct simdisk *dev, int which, spin_lock_init(&dev->lock); dev->users = 0; - dev->queue = blk_alloc_queue(simdisk_make_request, NUMA_NO_NODE); + dev->queue = blk_alloc_queue(NUMA_NO_NODE); if (dev->queue == NULL) { pr_err("blk_alloc_queue failed\n"); goto out_alloc_queue; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 748c4b2a9273..594f1d0b0e5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1012,7 +1012,7 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * - * Called from __blk_alloc_queue(). Responsible for initializing blkcg + * Called from blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: diff --git a/block/blk-core.c b/block/blk-core.c index 28f60985dc75..cb07a726dd71 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -283,7 +283,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure - * that its ->make_request_fn will not re-add plugging prior to calling + * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising @@ -510,7 +510,7 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *__blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; int ret; @@ -575,6 +575,7 @@ struct request_queue *__blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + q->nr_requests = BLKDEV_MAX_RQ; return q; @@ -592,21 +593,6 @@ fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } - -struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) -{ - struct request_queue *q; - - if (WARN_ON_ONCE(!make_request)) - return NULL; - - q = __blk_alloc_queue(node_id); - if (!q) - return NULL; - q->make_request_fn = make_request; - q->nr_requests = BLKDEV_MAX_RQ; - return q; -} EXPORT_SYMBOL(blk_alloc_queue); /** @@ -1088,15 +1074,15 @@ end_io: static blk_qc_t do_make_request(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct gendisk *disk = bio->bi_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { - if (!q->make_request_fn) - return blk_mq_make_request(q, bio); - ret = q->make_request_fn(q, bio); + if (!disk->fops->submit_bio) + return blk_mq_submit_bio(bio); + ret = disk->fops->submit_bio(bio); } - blk_queue_exit(q); + blk_queue_exit(disk->queue); return ret; } @@ -1113,10 +1099,9 @@ blk_qc_t generic_make_request(struct bio *bio) { /* * bio_list_on_stack[0] contains bios submitted by the current - * make_request_fn. - * bio_list_on_stack[1] contains bios that were submitted before - * the current make_request_fn, but that haven't been processed - * yet. + * ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the + * current ->submit_bio_bio, but that haven't been processed yet. */ struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; @@ -1125,10 +1110,10 @@ blk_qc_t generic_make_request(struct bio *bio) goto out; /* - * We only want one ->make_request_fn to be active at a time, else + * We only want one ->submit_bio to be active at a time, else * stack usage with stacked devices could be a problem. So use * current->bio_list to keep a list of requests submited by a - * make_request_fn function. current->bio_list is also used as a + * ->submit_bio method. current->bio_list is also used as a * flag to say if generic_make_request is currently active in this * task or not. If it is NULL, then no make_request is active. If * it is non-NULL, then a make_request is active, and new requests @@ -1146,12 +1131,12 @@ blk_qc_t generic_make_request(struct bio *bio) * We pretend that we have just taken it off a longer list, so * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be - * added. ->make_request() may indeed add some more bios + * added. ->submit_bio() may indeed add some more bios * through a recursive call to generic_make_request. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from - * bio_list, and call into ->make_request() again. + * bio_list, and call into ->submit_bio() again. */ BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); @@ -1201,9 +1186,9 @@ EXPORT_SYMBOL(generic_make_request); */ blk_qc_t direct_make_request(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct gendisk *disk = bio->bi_disk; - if (WARN_ON_ONCE(q->make_request_fn)) { + if (WARN_ON_ONCE(!disk->queue->mq_ops)) { bio_io_error(bio); return BLK_QC_T_NONE; } @@ -1212,10 +1197,10 @@ blk_qc_t direct_make_request(struct bio *bio) if (unlikely(bio_queue_enter(bio))) return BLK_QC_T_NONE; if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(q); + blk_queue_exit(disk->queue); return BLK_QC_T_NONE; } - return blk_mq_make_request(q, bio); + return blk_mq_submit_bio(bio); } EXPORT_SYMBOL_GPL(direct_make_request); diff --git a/block/blk-mq.c b/block/blk-mq.c index dbadb7defd61..948987e9b6ab 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2136,8 +2136,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) } /** - * blk_mq_make_request - Create and send a request to block device. - * @q: Request queue pointer. + * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The @@ -2151,8 +2150,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * Returns: Request queue cookie. */ -blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t blk_mq_submit_bio(struct bio *bio) { + struct request_queue *q = bio->bi_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { @@ -2277,7 +2277,7 @@ queue_exit: blk_queue_exit(q); return BLK_QC_T_NONE; } -EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ +EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) @@ -3017,7 +3017,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, { struct request_queue *uninit_q, *q; - uninit_q = __blk_alloc_queue(set->numa_node); + uninit_q = blk_alloc_queue(set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); uninit_q->queuedata = queuedata; diff --git a/block/blk.h b/block/blk.h index 0114fd92c8a0..9dcf51c94096 100644 --- a/block/blk.h +++ b/block/blk.h @@ -419,8 +419,6 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #endif } -struct request_queue *__blk_alloc_queue(int node_id); - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2fb25c348d53..2723a70eb855 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -282,7 +282,7 @@ out: return err; } -static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t brd_submit_bio(struct bio *bio) { struct brd_device *brd = bio->bi_disk->private_data; struct bio_vec bvec; @@ -330,6 +330,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, + .submit_bio = brd_submit_bio, .rw_page = brd_rw_page, }; @@ -381,7 +382,7 @@ static struct brd_device *brd_alloc(int i) spin_lock_init(&brd->brd_lock); INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); - brd->brd_queue = blk_alloc_queue(brd_make_request, NUMA_NO_NODE); + brd->brd_queue = blk_alloc_queue(NUMA_NO_NODE); if (!brd->brd_queue) goto out_free_dev; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 33d0831c99b6..0327408da79c 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1451,7 +1451,7 @@ extern void conn_free_crypto(struct drbd_connection *connection); /* drbd_req */ extern void do_submit(struct work_struct *ws); extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long); -extern blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio); +extern blk_qc_t drbd_submit_bio(struct bio *bio); extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req); extern int is_valid_ar_handle(struct drbd_request *, sector_t); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 26f4e0aa7393..2b05de0896e2 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -132,9 +132,10 @@ wait_queue_head_t drbd_pp_wait; DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); static const struct block_device_operations drbd_ops = { - .owner = THIS_MODULE, - .open = drbd_open, - .release = drbd_release, + .owner = THIS_MODULE, + .submit_bio = drbd_submit_bio, + .open = drbd_open, + .release = drbd_release, }; struct bio *bio_alloc_drbd(gfp_t gfp_mask) @@ -2801,7 +2802,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - q = blk_alloc_queue(drbd_make_request, NUMA_NO_NODE); + q = blk_alloc_queue(NUMA_NO_NODE); if (!q) goto out_no_q; device->rq_queue = q; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 936868047422..c7e14c9a6e5f 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1593,7 +1593,7 @@ void do_submit(struct work_struct *ws) } } -blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t drbd_submit_bio(struct bio *bio) { struct drbd_device *device = bio->bi_disk->private_data; unsigned long start_jif; diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c index 93ce0a00b2ae..907c6858aec0 100644 --- a/drivers/block/null_blk_main.c +++ b/drivers/block/null_blk_main.c @@ -1388,7 +1388,7 @@ static struct nullb_queue *nullb_to_queue(struct nullb *nullb) return &nullb->queues[index]; } -static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio) +static blk_qc_t null_submit_bio(struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector; sector_t nr_sectors = bio_sectors(bio); @@ -1575,7 +1575,13 @@ static void null_config_discard(struct nullb *nullb) blk_queue_flag_set(QUEUE_FLAG_DISCARD, nullb->q); } -static const struct block_device_operations null_ops = { +static const struct block_device_operations null_bio_ops = { + .owner = THIS_MODULE, + .submit_bio = null_submit_bio, + .report_zones = null_report_zones, +}; + +static const struct block_device_operations null_rq_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; @@ -1647,7 +1653,10 @@ static int null_gendisk_register(struct nullb *nullb) disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->major = null_major; disk->first_minor = nullb->index; - disk->fops = &null_ops; + if (queue_is_mq(nullb->q)) + disk->fops = &null_rq_ops; + else + disk->fops = &null_bio_ops; disk->private_data = nullb; disk->queue = nullb->q; strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); @@ -1792,7 +1801,7 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_tags; } } else if (dev->queue_mode == NULL_Q_BIO) { - nullb->q = blk_alloc_queue(null_queue_bio, dev->home_node); + nullb->q = blk_alloc_queue(dev->home_node); if (!nullb->q) { rv = -ENOMEM; goto out_cleanup_queues; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 29b0c62dc86c..5588bd4cd267 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -36,7 +36,7 @@ * block device, assembling the pieces to full packets and queuing them to the * packet I/O scheduler. * - * At the top layer there is a custom make_request_fn function that forwards + * At the top layer there is a custom ->submit_bio function that forwards * read requests directly to the iosched queue and puts write requests in the * unaligned write queue. A kernel thread performs the necessary read * gathering to convert the unaligned writes to aligned writes and then feeds @@ -2428,7 +2428,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) } } -static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t pkt_submit_bio(struct bio *bio) { struct pktcdvd_device *pd; char b[BDEVNAME_SIZE]; @@ -2436,7 +2436,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_split(&bio); - pd = q->queuedata; + pd = bio->bi_disk->queue->queuedata; if (!pd) { pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; @@ -2480,7 +2480,7 @@ static blk_qc_t pkt_make_request(struct request_queue *q, struct bio *bio) split = bio; } - pkt_make_request_write(q, split); + pkt_make_request_write(bio->bi_disk->queue, split); } while (split != bio); return BLK_QC_T_NONE; @@ -2685,6 +2685,7 @@ static char *pkt_devnode(struct gendisk *disk, umode_t *mode) static const struct block_device_operations pktcdvd_ops = { .owner = THIS_MODULE, + .submit_bio = pkt_submit_bio, .open = pkt_open, .release = pkt_close, .ioctl = pkt_ioctl, @@ -2749,7 +2750,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); disk->private_data = pd; - disk->queue = blk_alloc_queue(pkt_make_request, NUMA_NO_NODE); + disk->queue = blk_alloc_queue(NUMA_NO_NODE); if (!disk->queue) goto out_mem2; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 76cc584aa763..1088798c8dd0 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -90,12 +90,6 @@ struct ps3vram_priv { static int ps3vram_major; - -static const struct block_device_operations ps3vram_fops = { - .owner = THIS_MODULE, -}; - - #define DMA_NOTIFIER_HANDLE_BASE 0x66604200 /* first DMA notifier handle */ #define DMA_NOTIFIER_OFFSET_BASE 0x1000 /* first DMA notifier offset */ #define DMA_NOTIFIER_SIZE 0x40 @@ -585,7 +579,7 @@ out: return next; } -static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t ps3vram_submit_bio(struct bio *bio) { struct ps3_system_bus_device *dev = bio->bi_disk->private_data; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); @@ -610,6 +604,11 @@ static blk_qc_t ps3vram_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static const struct block_device_operations ps3vram_fops = { + .owner = THIS_MODULE, + .submit_bio = ps3vram_submit_bio, +}; + static int ps3vram_probe(struct ps3_system_bus_device *dev) { struct ps3vram_priv *priv; @@ -737,7 +736,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - queue = blk_alloc_queue(ps3vram_make_request, NUMA_NO_NODE); + queue = blk_alloc_queue(NUMA_NO_NODE); if (!queue) { dev_err(&dev->core, "blk_alloc_queue failed\n"); error = -ENOMEM; diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 1d52bc73dd0f..edacefff6e35 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -50,6 +50,8 @@ struct rsxx_bio_meta { static struct kmem_cache *bio_meta_pool; +static blk_qc_t rsxx_submit_bio(struct bio *bio); + /*----------------- Block Device Operations -----------------*/ static int rsxx_blkdev_ioctl(struct block_device *bdev, fmode_t mode, @@ -92,6 +94,7 @@ static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo) static const struct block_device_operations rsxx_fops = { .owner = THIS_MODULE, + .submit_bio = rsxx_submit_bio, .getgeo = rsxx_getgeo, .ioctl = rsxx_blkdev_ioctl, }; @@ -117,7 +120,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, } } -static blk_qc_t rsxx_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t rsxx_submit_bio(struct bio *bio) { struct rsxx_cardinfo *card = bio->bi_disk->private_data; struct rsxx_bio_meta *bio_meta; @@ -233,7 +236,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) return -ENOMEM; } - card->queue = blk_alloc_queue(rsxx_make_request, NUMA_NO_NODE); + card->queue = blk_alloc_queue(NUMA_NO_NODE); if (!card->queue) { dev_err(CARD_TO_DEV(card), "Failed queue alloc\n"); unregister_blkdev(card->major, DRIVER_NAME); diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 3b89c07f9e9d..2b95d7b33b91 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -519,7 +519,7 @@ static int mm_check_plugged(struct cardinfo *card) return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } -static blk_qc_t mm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t mm_submit_bio(struct bio *bio) { struct cardinfo *card = bio->bi_disk->private_data; @@ -779,6 +779,7 @@ static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo) static const struct block_device_operations mm_fops = { .owner = THIS_MODULE, + .submit_bio = mm_submit_bio, .getgeo = mm_getgeo, .revalidate_disk = mm_revalidate, }; @@ -886,7 +887,7 @@ static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) card->biotail = &card->bio; spin_lock_init(&card->lock); - card->queue = blk_alloc_queue(mm_make_request, NUMA_NO_NODE); + card->queue = blk_alloc_queue(NUMA_NO_NODE); if (!card->queue) goto failed_alloc; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0564e3f38408..f9a57f147ee1 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -793,9 +793,9 @@ static void zram_sync_read(struct work_struct *work) } /* - * Block layer want one ->make_request_fn to be active at a time - * so if we use chained IO with parent IO in same context, - * it's a deadlock. To avoid, it, it uses worker thread context. + * Block layer want one ->submit_bio to be active at a time, so if we use + * chained IO with parent IO in same context, it's a deadlock. To avoid that, + * use a worker thread context. */ static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec, unsigned long entry, struct bio *bio) @@ -1584,7 +1584,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) /* * Handler function for all zram I/O requests. */ -static blk_qc_t zram_make_request(struct request_queue *queue, struct bio *bio) +static blk_qc_t zram_submit_bio(struct bio *bio) { struct zram *zram = bio->bi_disk->private_data; @@ -1813,6 +1813,7 @@ static int zram_open(struct block_device *bdev, fmode_t mode) static const struct block_device_operations zram_devops = { .open = zram_open, + .submit_bio = zram_submit_bio, .swap_slot_free_notify = zram_slot_free_notify, .rw_page = zram_rw_page, .owner = THIS_MODULE @@ -1891,7 +1892,7 @@ static int zram_add(void) #ifdef CONFIG_ZRAM_WRITEBACK spin_lock_init(&zram->wb_limit_lock); #endif - queue = blk_alloc_queue(zram_make_request, NUMA_NO_NODE); + queue = blk_alloc_queue(NUMA_NO_NODE); if (!queue) { pr_err("Error allocating disk queue for device %d\n", device_id); diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index db38a68abb6c..fe78bf0fdce5 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -236,10 +236,6 @@ err_dev: return tgt_dev; } -static const struct block_device_operations nvm_fops = { - .owner = THIS_MODULE, -}; - static struct nvm_tgt_type *__nvm_find_target_type(const char *name) { struct nvm_tgt_type *tt; @@ -380,7 +376,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) goto err_dev; } - tqueue = blk_alloc_queue(tt->make_rq, dev->q->node); + tqueue = blk_alloc_queue(dev->q->node); if (!tqueue) { ret = -ENOMEM; goto err_disk; @@ -390,7 +386,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) tdisk->flags = GENHD_FL_EXT_DEVT; tdisk->major = 0; tdisk->first_minor = 0; - tdisk->fops = &nvm_fops; + tdisk->fops = tt->bops; tdisk->queue = tqueue; targetdata = tt->init(tgt_dev, tdisk, create->flags); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 7a4a1b7a941d..b6246f73895c 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -47,9 +47,9 @@ static struct pblk_global_caches pblk_caches = { struct bio_set pblk_bio_set; -static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) +static blk_qc_t pblk_submit_bio(struct bio *bio) { - struct pblk *pblk = q->queuedata; + struct pblk *pblk = bio->bi_disk->queue->queuedata; if (bio_op(bio) == REQ_OP_DISCARD) { pblk_discard(pblk, bio); @@ -79,6 +79,12 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static const struct block_device_operations pblk_bops = { + .owner = THIS_MODULE, + .submit_bio = pblk_submit_bio, +}; + + static size_t pblk_trans_map_size(struct pblk *pblk) { int entry_size = 8; @@ -1280,7 +1286,7 @@ static struct nvm_tgt_type tt_pblk = { .name = "pblk", .version = {1, 0, 0}, - .make_rq = pblk_make_rq, + .bops = &pblk_bops, .capacity = pblk_capacity, .init = pblk_init, diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 7acf024e99f3..fc5702b10074 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1158,7 +1158,7 @@ static void quit_max_writeback_rate(struct cache_set *c, /* Cached devices - read & write stuff */ -blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; struct bcache_device *d = bio->bi_disk->private_data; @@ -1291,7 +1291,7 @@ static void flash_dev_nodata(struct closure *cl) continue_at(cl, search_free, NULL); } -blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio) +blk_qc_t flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; diff --git a/drivers/md/bcache/request.h b/drivers/md/bcache/request.h index bb005c93dd72..82b38366a95d 100644 --- a/drivers/md/bcache/request.h +++ b/drivers/md/bcache/request.h @@ -37,10 +37,10 @@ unsigned int bch_get_congested(const struct cache_set *c); void bch_data_insert(struct closure *cl); void bch_cached_dev_request_init(struct cached_dev *dc); -blk_qc_t cached_dev_make_request(struct request_queue *q, struct bio *bio); +blk_qc_t cached_dev_submit_bio(struct bio *bio); void bch_flash_dev_request_init(struct bcache_device *d); -blk_qc_t flash_dev_make_request(struct request_queue *q, struct bio *bio); +blk_qc_t flash_dev_submit_bio(struct bio *bio); extern struct kmem_cache *bch_search_cache; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 21aa168113d3..de13f6e91696 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -680,7 +680,16 @@ static int ioctl_dev(struct block_device *b, fmode_t mode, return d->ioctl(d, mode, cmd, arg); } -static const struct block_device_operations bcache_ops = { +static const struct block_device_operations bcache_cached_ops = { + .submit_bio = cached_dev_submit_bio, + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +static const struct block_device_operations bcache_flash_ops = { + .submit_bio = flash_dev_submit_bio, .open = open_dev, .release = release_dev, .ioctl = ioctl_dev, @@ -820,8 +829,8 @@ static void bcache_device_free(struct bcache_device *d) } static int bcache_device_init(struct bcache_device *d, unsigned int block_size, - sector_t sectors, make_request_fn make_request_fn, - struct block_device *cached_bdev) + sector_t sectors, struct block_device *cached_bdev, + const struct block_device_operations *ops) { struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, @@ -868,10 +877,10 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->major = bcache_major; d->disk->first_minor = idx_to_first_minor(idx); - d->disk->fops = &bcache_ops; + d->disk->fops = ops; d->disk->private_data = d; - q = blk_alloc_queue(make_request_fn, NUMA_NO_NODE); + q = blk_alloc_queue(NUMA_NO_NODE); if (!q) return -ENOMEM; @@ -1355,7 +1364,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) ret = bcache_device_init(&dc->disk, block_size, dc->bdev->bd_part->nr_sects - dc->sb.data_offset, - cached_dev_make_request, dc->bdev); + dc->bdev, &bcache_cached_ops); if (ret) return ret; @@ -1468,7 +1477,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) kobject_init(&d->kobj, &bch_flash_dev_ktype); if (bcache_device_init(d, block_bytes(c), u->sectors, - flash_dev_make_request, NULL)) + NULL, &bcache_flash_ops)) goto err; bcache_device_attach(d, c, u - c->uuids); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5aa7a604f4cb..5acfaba3700d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1770,7 +1770,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, } /* - * If in ->make_request_fn we need to use blk_queue_split(), otherwise + * If in ->queue_bio we need to use blk_queue_split(), otherwise * queue_limits for abnormal requests (e.g. discard, writesame, etc) * won't be imposed. */ @@ -1787,7 +1787,7 @@ static blk_qc_t dm_process_bio(struct mapped_device *md, return __split_and_process_bio(md, map, bio); } -static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t dm_submit_bio(struct bio *bio) { struct mapped_device *md = bio->bi_disk->private_data; blk_qc_t ret = BLK_QC_T_NONE; @@ -1798,12 +1798,12 @@ static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) /* * We are called with a live reference on q_usage_counter, but * that one will be released as soon as we return. Grab an - * extra one as blk_mq_make_request expects to be able to - * consume a reference (which lives until the request is freed - * in case a request is allocated). + * extra one as blk_mq_submit_bio expects to be able to consume + * a reference (which lives until the request is freed in case a + * request is allocated). */ - percpu_ref_get(&q->q_usage_counter); - return blk_mq_make_request(q, bio); + percpu_ref_get(&bio->bi_disk->queue->q_usage_counter); + return blk_mq_submit_bio(bio); } map = dm_get_live_table(md, &srcu_idx); @@ -1988,11 +1988,11 @@ static struct mapped_device *alloc_dev(int minor) spin_lock_init(&md->uevent_lock); /* - * default to bio-based required ->make_request_fn until DM - * table is loaded and md->type established. If request-based - * table is loaded: blk-mq will override accordingly. + * default to bio-based until DM table is loaded and md->type + * established. If request-based table is loaded: blk-mq will + * override accordingly. */ - md->queue = blk_alloc_queue(dm_make_request, numa_node_id); + md->queue = blk_alloc_queue(numa_node_id); if (!md->queue) goto bad; @@ -3232,6 +3232,7 @@ static const struct pr_ops dm_pr_ops = { }; static const struct block_device_operations dm_blk_dops = { + .submit_bio = dm_submit_bio, .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, diff --git a/drivers/md/md.c b/drivers/md/md.c index ff20868e5e1b..7b7cb49be351 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -463,7 +463,7 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); -static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); const int sgrp = op_stat_group(bio_op(bio)); @@ -5641,7 +5641,7 @@ static int md_alloc(dev_t dev, char *name) mddev->hold_active = UNTIL_STOP; error = -ENOMEM; - mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE); + mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) goto abort; @@ -7823,6 +7823,7 @@ static int md_revalidate(struct gendisk *disk) static const struct block_device_operations md_fops = { .owner = THIS_MODULE, + .submit_bio = md_submit_bio, .open = md_open, .release = md_release, .ioctl = md_ioctl, diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 39030a324d7f..1f718381a045 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -162,7 +162,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, return err; } -static blk_qc_t nd_blk_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t nd_blk_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip; struct nd_namespace_blk *nsblk = bio->bi_disk->private_data; @@ -225,6 +225,7 @@ static int nsblk_rw_bytes(struct nd_namespace_common *ndns, static const struct block_device_operations nd_blk_fops = { .owner = THIS_MODULE, + .submit_bio = nd_blk_submit_bio, .revalidate_disk = nvdimm_revalidate_disk, }; @@ -250,7 +251,7 @@ static int nsblk_attach_disk(struct nd_namespace_blk *nsblk) internal_nlba = div_u64(nsblk->size, nsblk_internal_lbasize(nsblk)); available_disk_size = internal_nlba * nsblk_sector_size(nsblk); - q = blk_alloc_queue(nd_blk_make_request, NUMA_NO_NODE); + q = blk_alloc_queue(NUMA_NO_NODE); if (!q) return -ENOMEM; if (devm_add_action_or_reset(dev, nd_blk_release_queue, q)) diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 48e9d169b6f9..412d21d8f643 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1439,7 +1439,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, return ret; } -static blk_qc_t btt_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct btt *btt = bio->bi_disk->private_data; @@ -1512,6 +1512,7 @@ static int btt_getgeo(struct block_device *bd, struct hd_geometry *geo) static const struct block_device_operations btt_fops = { .owner = THIS_MODULE, + .submit_bio = btt_submit_bio, .rw_page = btt_rw_page, .getgeo = btt_getgeo, .revalidate_disk = nvdimm_revalidate_disk, @@ -1523,7 +1524,7 @@ static int btt_blk_init(struct btt *btt) struct nd_namespace_common *ndns = nd_btt->ndns; /* create a new disk and request queue for btt */ - btt->btt_queue = blk_alloc_queue(btt_make_request, NUMA_NO_NODE); + btt->btt_queue = blk_alloc_queue(NUMA_NO_NODE); if (!btt->btt_queue) return -ENOMEM; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index d25e66fd942d..94790e6e0e4c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -189,7 +189,7 @@ static blk_status_t pmem_do_write(struct pmem_device *pmem, return rc; } -static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t pmem_submit_bio(struct bio *bio) { int ret = 0; blk_status_t rc = 0; @@ -281,6 +281,7 @@ __weak long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, static const struct block_device_operations pmem_fops = { .owner = THIS_MODULE, + .submit_bio = pmem_submit_bio, .rw_page = pmem_rw_page, .revalidate_disk = nvdimm_revalidate_disk, }; @@ -423,7 +424,7 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - q = blk_alloc_queue(pmem_make_request, dev_to_node(dev)); + q = blk_alloc_queue(dev_to_node(dev)); if (!q) return -ENOMEM; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6810c8812aed..5192a024dc1b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2178,6 +2178,7 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) const struct block_device_operations nvme_ns_head_ops = { .owner = THIS_MODULE, + .submit_bio = nvme_ns_head_submit_bio, .open = nvme_ns_head_open, .release = nvme_ns_head_release, .ioctl = nvme_ioctl, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 5a5205ea570a..89afcf943bf8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -291,8 +291,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) return false; } -static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, - struct bio *bio) +blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { struct nvme_ns_head *head = bio->bi_disk->private_data; struct device *dev = disk_to_dev(head->disk); @@ -374,7 +373,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || !multipath) return 0; - q = blk_alloc_queue(nvme_ns_head_make_request, ctrl->numa_node); + q = blk_alloc_queue(ctrl->numa_node); if (!q) goto out; blk_queue_flag_set(QUEUE_FLAG_NONROT, q); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 61780c38f51f..9f2b0e0b4558 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -586,6 +586,7 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl); bool nvme_mpath_clear_current_path(struct nvme_ns *ns); void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl); struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); +blk_qc_t nvme_ns_head_submit_bio(struct bio *bio); static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index dfe21eb72760..35666c3537de 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -31,8 +31,7 @@ static int dcssblk_open(struct block_device *bdev, fmode_t mode); static void dcssblk_release(struct gendisk *disk, fmode_t mode); -static blk_qc_t dcssblk_make_request(struct request_queue *q, - struct bio *bio); +static blk_qc_t dcssblk_submit_bio(struct bio *bio); static long dcssblk_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn); @@ -41,6 +40,7 @@ static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; static int dcssblk_major; static const struct block_device_operations dcssblk_devops = { .owner = THIS_MODULE, + .submit_bio = dcssblk_submit_bio, .open = dcssblk_open, .release = dcssblk_release, }; @@ -651,8 +651,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char } dev_info->gd->major = dcssblk_major; dev_info->gd->fops = &dcssblk_devops; - dev_info->dcssblk_queue = - blk_alloc_queue(dcssblk_make_request, NUMA_NO_NODE); + dev_info->dcssblk_queue = blk_alloc_queue(NUMA_NO_NODE); dev_info->gd->queue = dev_info->dcssblk_queue; dev_info->gd->private_data = dev_info; blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); @@ -868,7 +867,7 @@ dcssblk_release(struct gendisk *disk, fmode_t mode) } static blk_qc_t -dcssblk_make_request(struct request_queue *q, struct bio *bio) +dcssblk_submit_bio(struct bio *bio) { struct dcssblk_dev_info *dev_info; struct bio_vec bvec; diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 5456f0ad5a40..c2536f7767b3 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -182,7 +182,7 @@ static unsigned long xpram_highest_page_index(void) /* * Block device make request function. */ -static blk_qc_t xpram_make_request(struct request_queue *q, struct bio *bio) +static blk_qc_t xpram_submit_bio(struct bio *bio) { xpram_device_t *xdev = bio->bi_disk->private_data; struct bio_vec bvec; @@ -250,6 +250,7 @@ static int xpram_getgeo(struct block_device *bdev, struct hd_geometry *geo) static const struct block_device_operations xpram_devops = { .owner = THIS_MODULE, + .submit_bio = xpram_submit_bio, .getgeo = xpram_getgeo, }; @@ -343,8 +344,7 @@ static int __init xpram_setup_blkdev(void) xpram_disks[i] = alloc_disk(1); if (!xpram_disks[i]) goto out; - xpram_queues[i] = blk_alloc_queue(xpram_make_request, - NUMA_NO_NODE); + xpram_queues[i] = blk_alloc_queue(NUMA_NO_NODE); if (!xpram_queues[i]) { put_disk(xpram_disks[i]); goto out; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index faa340b70123..23230c1d031e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -596,6 +596,6 @@ static inline void blk_mq_cleanup_rq(struct request *rq) rq->q->mq_ops->cleanup_rq(rq); } -blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio); +blk_qc_t blk_mq_submit_bio(struct bio *bio); #endif diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d002defc1789..083ffc5bc51b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -286,8 +286,6 @@ static inline unsigned short req_get_ioprio(struct request *req) struct blk_queue_ctx; -typedef blk_qc_t (make_request_fn) (struct request_queue *q, struct bio *bio); - struct bio_vec; enum blk_eh_timer_return { @@ -398,8 +396,6 @@ struct request_queue { struct blk_queue_stats *stats; struct rq_qos *rq_qos; - make_request_fn *make_request_fn; - const struct blk_mq_ops *mq_ops; /* sw queues */ @@ -1162,7 +1158,7 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, extern void blk_dump_rq_flags(struct request *, char *); bool __must_check blk_get_queue(struct request_queue *); -struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id); +struct request_queue *blk_alloc_queue(int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); @@ -1778,6 +1774,7 @@ static inline void blk_ksm_unregister(struct request_queue *q) { } struct block_device_operations { + blk_qc_t (*submit_bio) (struct bio *bio); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ee8ec2e68055..1db223710b28 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -631,7 +631,6 @@ static inline int nvm_next_ppa_in_chk(struct nvm_tgt_dev *dev, return last; } -typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, int flags); @@ -650,7 +649,7 @@ struct nvm_tgt_type { int flags; /* target entry points */ - nvm_tgt_make_rq_fn *make_rq; + const struct block_device_operations *bops; nvm_tgt_capacity_fn *capacity; /* module-specific init/teardown */ -- cgit v1.2.3 From ed00aabd5eb9fb44d6aff1173234a2e911b9fead Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:44 +0200 Subject: block: rename generic_make_request to submit_bio_noacct generic_make_request has always been very confusingly misnamed, so rename it to submit_bio_noacct to make it clear that it is submit_bio minus accounting and a few checks. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- Documentation/block/biodoc.rst | 2 +- Documentation/fault-injection/fault-injection.rst | 2 +- Documentation/trace/ftrace.rst | 4 +-- block/bio.c | 14 +++++----- block/blk-core.c | 32 +++++++++++------------ block/blk-crypto-fallback.c | 2 +- block/blk-crypto.c | 2 +- block/blk-merge.c | 2 +- block/blk-throttle.c | 4 +-- block/bounce.c | 2 +- drivers/block/drbd/drbd_int.h | 6 ++--- drivers/block/drbd/drbd_main.c | 2 +- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/lightnvm/pblk-read.c | 2 +- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/btree.c | 2 +- drivers/md/bcache/request.c | 7 +++-- drivers/md/dm-cache-target.c | 6 ++--- drivers/md/dm-clone-target.c | 10 +++---- drivers/md/dm-crypt.c | 6 ++--- drivers/md/dm-delay.c | 2 +- drivers/md/dm-era-target.c | 2 +- drivers/md/dm-integrity.c | 4 +-- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-snap.c | 6 ++--- drivers/md/dm-thin.c | 4 +-- drivers/md/dm-verity-target.c | 2 +- drivers/md/dm-writecache.c | 2 +- drivers/md/dm-zoned-target.c | 2 +- drivers/md/dm.c | 10 +++---- drivers/md/md-faulty.c | 4 +-- drivers/md/md-linear.c | 4 +-- drivers/md/md-multipath.c | 4 +-- drivers/md/raid0.c | 8 +++--- drivers/md/raid1.c | 14 +++++----- drivers/md/raid10.c | 28 ++++++++++---------- drivers/md/raid5.c | 10 +++---- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 2 +- 44 files changed, 115 insertions(+), 118 deletions(-) (limited to 'block') diff --git a/Documentation/block/biodoc.rst b/Documentation/block/biodoc.rst index 267384159bf7..afda5e30a82e 100644 --- a/Documentation/block/biodoc.rst +++ b/Documentation/block/biodoc.rst @@ -1036,7 +1036,7 @@ Now the generic block layer performs partition-remapping early and thus provides drivers with a sector number relative to whole device, rather than having to take partition number into account in order to arrive at the true sector number. The routine blk_partition_remap() is invoked by -generic_make_request even before invoking the queue specific ->submit_bio, +submit_bio_noacct even before invoking the queue specific ->submit_bio, so the i/o scheduler also gets to operate on whole disk sector numbers. This should typically not require changes to block drivers, it just never gets to invoke its own partition sector offset calculations since all bios diff --git a/Documentation/fault-injection/fault-injection.rst b/Documentation/fault-injection/fault-injection.rst index f51bb21d20e4..f850ad018b70 100644 --- a/Documentation/fault-injection/fault-injection.rst +++ b/Documentation/fault-injection/fault-injection.rst @@ -24,7 +24,7 @@ Available fault injection capabilities injects disk IO errors on devices permitted by setting /sys/block//make-it-fail or - /sys/block///make-it-fail. (generic_make_request()) + /sys/block///make-it-fail. (submit_bio_noacct()) - fail_mmc_request diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst index 430a16283103..80ba765a8237 100644 --- a/Documentation/trace/ftrace.rst +++ b/Documentation/trace/ftrace.rst @@ -1453,7 +1453,7 @@ function-trace, we get a much larger output:: => __blk_run_queue_uncond => __blk_run_queue => blk_queue_bio - => generic_make_request + => submit_bio_noacct => submit_bio => submit_bh => __ext3_get_inode_loc @@ -1738,7 +1738,7 @@ tracers. => __blk_run_queue_uncond => __blk_run_queue => blk_queue_bio - => generic_make_request + => submit_bio_noacct => submit_bio => submit_bh => ext3_bread diff --git a/block/bio.c b/block/bio.c index fc1299f9d86a..ef91782fd668 100644 --- a/block/bio.c +++ b/block/bio.c @@ -358,7 +358,7 @@ static void bio_alloc_rescue(struct work_struct *work) if (!bio) break; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -416,19 +416,19 @@ static void punt_bios_to_rescuer(struct bio_set *bs) * submit the previously allocated bio for IO before attempting to allocate * a new one. Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under generic_make_request() (i.e. any block + * Note that when running under submit_bio_noacct() (i.e. any block * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent + * submit_bio_noacct() that converts recursion into iteration, to prevent * stack overflows. * * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have + * submit_bio_noacct() would be susceptible to deadlocks, but we have * deadlock avoidance code that resubmits any blocked bios from a rescuer * thread. * * However, we do not guarantee forward progress for allocations from other * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad * for per bio allocations. * * RETURNS: @@ -457,14 +457,14 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, nr_iovecs > 0)) return NULL; /* - * generic_make_request() converts recursion to iteration; this + * submit_bio_noacct() converts recursion to iteration; this * means if we're running beneath it, any bios we allocate and * submit will not be submitted (and thus freed) until after we * return. * * This exposes us to a potential deadlock if we allocate * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate + * underneath submit_bio_noacct(). If we were to allocate * multiple bios (say a stacking block driver that was splitting * bios), we would deadlock if we exhausted the mempool's * reserve. diff --git a/block/blk-core.c b/block/blk-core.c index cb07a726dd71..ff9a88d2d244 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -956,8 +956,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool -generic_make_request_checks(struct bio *bio) +static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; blk_status_t status = BLK_STS_IOERR; @@ -985,9 +984,8 @@ generic_make_request_checks(struct bio *bio) } /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them. + * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { @@ -1072,7 +1070,7 @@ end_io: return false; } -static blk_qc_t do_make_request(struct bio *bio) +static blk_qc_t __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_disk; blk_qc_t ret = BLK_QC_T_NONE; @@ -1087,7 +1085,7 @@ static blk_qc_t do_make_request(struct bio *bio) } /** - * generic_make_request - re-submit a bio to the block device layer for I/O + * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is @@ -1095,7 +1093,7 @@ static blk_qc_t do_make_request(struct bio *bio) * systems and other upper level users of the block layer should use * submit_bio() instead. */ -blk_qc_t generic_make_request(struct bio *bio) +blk_qc_t submit_bio_noacct(struct bio *bio) { /* * bio_list_on_stack[0] contains bios submitted by the current @@ -1106,7 +1104,7 @@ blk_qc_t generic_make_request(struct bio *bio) struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; - if (!generic_make_request_checks(bio)) + if (!submit_bio_checks(bio)) goto out; /* @@ -1114,7 +1112,7 @@ blk_qc_t generic_make_request(struct bio *bio) * stack usage with stacked devices could be a problem. So use * current->bio_list to keep a list of requests submited by a * ->submit_bio method. current->bio_list is also used as a - * flag to say if generic_make_request is currently active in this + * flag to say if submit_bio_noacct is currently active in this * task or not. If it is NULL, then no make_request is active. If * it is non-NULL, then a make_request is active, and new requests * should be added at the tail @@ -1132,7 +1130,7 @@ blk_qc_t generic_make_request(struct bio *bio) * we assign bio_list to a pointer to the bio_list_on_stack, * thus initialising the bio_list of new bios to be * added. ->submit_bio() may indeed add some more bios - * through a recursive call to generic_make_request. If it + * through a recursive call to submit_bio_noacct. If it * did, we find a non-NULL value in bio_list and re-enter the loop * from the top. In this case we really did just take the bio * of the top of the list (no pretending) and so remove it from @@ -1150,7 +1148,7 @@ blk_qc_t generic_make_request(struct bio *bio) /* Create a fresh bio_list for all subordinate requests */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = do_make_request(bio); + ret = __submit_bio(bio); /* sort new bios into those for a lower level * and those for the same level @@ -1174,13 +1172,13 @@ blk_qc_t generic_make_request(struct bio *bio) out: return ret; } -EXPORT_SYMBOL(generic_make_request); +EXPORT_SYMBOL(submit_bio_noacct); /** * direct_make_request - hand a buffer directly to its device driver for I/O * @bio: The bio describing the location in memory and on the device. * - * This function behaves like generic_make_request(), but does not protect + * This function behaves like submit_bio_noacct(), but does not protect * against recursion. Must only be used if the called driver is known * to be blk-mq based. */ @@ -1192,7 +1190,7 @@ blk_qc_t direct_make_request(struct bio *bio) bio_io_error(bio); return BLK_QC_T_NONE; } - if (!generic_make_request_checks(bio)) + if (!submit_bio_checks(bio)) return BLK_QC_T_NONE; if (unlikely(bio_queue_enter(bio))) return BLK_QC_T_NONE; @@ -1263,13 +1261,13 @@ blk_qc_t submit_bio(struct bio *bio) blk_qc_t ret; psi_memstall_enter(&pflags); - ret = generic_make_request(bio); + ret = submit_bio_noacct(bio); psi_memstall_leave(&pflags); return ret; } - return generic_make_request(bio); + return submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 6e49688a2d80..c162b754efbd 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -228,7 +228,7 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) return false; } bio_chain(split_bio, bio); - generic_make_request(bio); + submit_bio_noacct(bio); *bio_ptr = split_bio; } diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 6533c9b36ab8..2d5e60023b08 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -239,7 +239,7 @@ void __blk_crypto_free_request(struct request *rq) * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via - * generic_make_request. A bounce bio will be allocated to encrypt the contents + * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * diff --git a/block/blk-merge.c b/block/blk-merge.c index 20fa22906041..5196dc145270 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,7 +338,7 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) bio_chain(split, *bio); trace_block_split(q, split, (*bio)->bi_iter.bi_sector); - generic_make_request(*bio); + submit_bio_noacct(*bio); *bio = split; } } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index ad37043297ed..fee3325edf27 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1339,8 +1339,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); - while((bio = bio_list_pop(&bio_list_on_stack))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bio_list_on_stack))) + submit_bio_noacct(bio); blk_finish_plug(&plug); } } diff --git a/block/bounce.c b/block/bounce.c index c3aaed070124..431be88a0240 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -309,7 +309,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, if (!passthrough && sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); - generic_make_request(*bio_orig); + submit_bio_noacct(*bio_orig); *bio_orig = bio; } bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 0327408da79c..fe6cb99eb917 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1576,12 +1576,12 @@ void drbd_set_my_capacity(struct drbd_device *device, sector_t size); /* * used to submit our private bio */ -static inline void drbd_generic_make_request(struct drbd_device *device, +static inline void drbd_submit_bio_noacct(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); if (!bio->bi_disk) { - drbd_err(device, "drbd_generic_make_request: bio->bi_disk == NULL\n"); + drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; @@ -1590,7 +1590,7 @@ static inline void drbd_generic_make_request(struct drbd_device *device, if (drbd_insert_fault(device, fault_type)) bio_io_error(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2b05de0896e2..7c34cc0ad8cc 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2325,7 +2325,7 @@ static void do_retry(struct work_struct *ws) * workqueues instead. */ - /* We are not just doing generic_make_request(), + /* We are not just doing submit_bio_noacct(), * as we want to keep the start_time information. */ inc_ap_bio(device); __drbd_make_request(device, bio, start_jif); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 3a3f2b6a821f..c74f561b4eab 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1723,7 +1723,7 @@ next_bio: bios = bios->bi_next; bio->bi_next = NULL; - drbd_generic_make_request(device, fault_type, bio); + drbd_submit_bio_noacct(device, fault_type, bio); } while (bios); return 0; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index c7e14c9a6e5f..674be09b2da9 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1164,7 +1164,7 @@ drbd_submit_req_private_bio(struct drbd_request *req) else if (bio_op(bio) == REQ_OP_DISCARD) drbd_process_discard_or_zeroes_req(req, EE_TRIM); else - generic_make_request(bio); + submit_bio_noacct(bio); put_ldev(device); } else bio_io_error(bio); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 2b89c9f2ca70..7c903de5c4e1 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1525,7 +1525,7 @@ int w_restart_disk_io(struct drbd_work *w, int cancel) drbd_req_make_private_bio(req, req->master_bio); bio_set_dev(req->private_bio, device->ldev->backing_bdev); - generic_make_request(req->private_bio); + submit_bio_noacct(req->private_bio); return 0; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 5588bd4cd267..4becc1efe775 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -913,7 +913,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } atomic_inc(&pd->cdrw.pending_bios); - generic_make_request(bio); + submit_bio_noacct(bio); } } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 140927ebf41e..c28537a489bc 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -320,7 +320,7 @@ split_retry: split_bio = bio_split(bio, nr_secs * NR_PHY_IN_LOG, GFP_KERNEL, &pblk_bio_set); bio_chain(split_bio, bio); - generic_make_request(bio); + submit_bio_noacct(bio); /* New bio contains first N sectors of the previous one, so * we can continue to use existing rqd, but we need to shrink diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 221e0191b687..3c708e8b5e2d 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -929,7 +929,7 @@ static inline void closure_bio_submit(struct cache_set *c, bio_endio(bio); return; } - generic_make_request(bio); + submit_bio_noacct(bio); } /* diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 6548a601edf0..d5c51e332046 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -959,7 +959,7 @@ err: * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * - * If IO is necessary and running under generic_make_request, returns -EAGAIN. + * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index fc5702b10074..dd012ebface0 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1115,7 +1115,7 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio) !blk_queue_discard(bdev_get_queue(dc->bdev))) bio->bi_end_io(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } static void quit_max_writeback_rate(struct cache_set *c, @@ -1197,7 +1197,7 @@ blk_qc_t cached_dev_submit_bio(struct bio *bio) if (!bio->bi_iter.bi_size) { /* * can't call bch_journal_meta from under - * generic_make_request + * submit_bio_noacct */ continue_at_nobarrier(&s->cl, cached_dev_nodata, @@ -1311,8 +1311,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) if (!bio->bi_iter.bi_size) { /* - * can't call bch_journal_meta from under - * generic_make_request + * can't call bch_journal_meta from under submit_bio_noacct */ continue_at_nobarrier(&s->cl, flash_dev_nodata, diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index d3bb355819a4..9eccced92896 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -886,7 +886,7 @@ static void accounted_complete(struct cache *cache, struct bio *bio) static void accounted_request(struct cache *cache, struct bio *bio) { accounted_begin(cache, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } static void issue_op(struct bio *bio, void *context) @@ -1792,7 +1792,7 @@ static bool process_bio(struct cache *cache, struct bio *bio) bool commit_needed; if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED) - generic_make_request(bio); + submit_bio_noacct(bio); return commit_needed; } @@ -1858,7 +1858,7 @@ static bool process_discard_bio(struct cache *cache, struct bio *bio) if (cache->features.discard_passdown) { remap_to_origin(cache, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } else bio_endio(bio); diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 5ce96ddf1ce1..59ed8a67c2e3 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -330,7 +330,7 @@ static void submit_bios(struct bio_list *bios) blk_start_plug(&plug); while ((bio = bio_list_pop(bios))) - generic_make_request(bio); + submit_bio_noacct(bio); blk_finish_plug(&plug); } @@ -346,7 +346,7 @@ static void submit_bios(struct bio_list *bios) static void issue_bio(struct clone *clone, struct bio *bio) { if (!bio_triggers_commit(clone, bio)) { - generic_make_request(bio); + submit_bio_noacct(bio); return; } @@ -473,7 +473,7 @@ static void complete_discard_bio(struct clone *clone, struct bio *bio, bool succ bio_region_range(clone, bio, &rs, &nr_regions); trim_bio(bio, region_to_sector(clone, rs), nr_regions << clone->region_shift); - generic_make_request(bio); + submit_bio_noacct(bio); } else bio_endio(bio); } @@ -865,7 +865,7 @@ static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio bio->bi_private = hd; atomic_inc(&hd->clone->hydrations_in_flight); - generic_make_request(bio); + submit_bio_noacct(bio); } /* @@ -1281,7 +1281,7 @@ static void process_deferred_flush_bios(struct clone *clone) */ bio_endio(bio); } else { - generic_make_request(bio); + submit_bio_noacct(bio); } } } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 000ddfab5ba0..ad324abb8c49 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1789,7 +1789,7 @@ static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) return 1; } - generic_make_request(clone); + submit_bio_noacct(clone); return 0; } @@ -1815,7 +1815,7 @@ static void kcryptd_io_write(struct dm_crypt_io *io) { struct bio *clone = io->ctx.bio_out; - generic_make_request(clone); + submit_bio_noacct(clone); } #define crypt_io_from_node(node) rb_entry((node), struct dm_crypt_io, rb_node) @@ -1893,7 +1893,7 @@ static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) clone->bi_iter.bi_sector = cc->start + io->sector; if (likely(!async) && test_bit(DM_CRYPT_NO_OFFLOAD, &cc->flags)) { - generic_make_request(clone); + submit_bio_noacct(clone); return; } diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index f496213f8b67..2628a832787b 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -72,7 +72,7 @@ static void flush_bios(struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index bdb84b8e7162..566ddbdb16a4 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1265,7 +1265,7 @@ static void process_deferred_bios(struct era *era) bio_io_error(bio); else while ((bio = bio_list_pop(&marked_bios))) - generic_make_request(bio); + submit_bio_noacct(bio); } static void process_rpc_calls(struct era *era) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 81dc5ff08909..ae866e469e1b 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -2115,12 +2115,12 @@ offload_to_thread: dio->in_flight = (atomic_t)ATOMIC_INIT(1); dio->completion = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); return; } - generic_make_request(bio); + submit_bio_noacct(bio); if (need_sync_io) { wait_for_completion_io(&read_comp); diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 78cff42d987e..73bb23de6336 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -677,7 +677,7 @@ static void process_queued_bios(struct work_struct *work) bio_endio(bio); break; case DM_MAPIO_REMAPPED: - generic_make_request(bio); + submit_bio_noacct(bio); break; case DM_MAPIO_SUBMITTED: break; diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 2f655d9f4200..fa09bc4e4c54 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -779,7 +779,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes) wakeup_mirrord(ms); } else { map_bio(get_default_mirror(ms), bio); - generic_make_request(bio); + submit_bio_noacct(bio); } } } diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index 963d3774c93e..2d1d4a4c399c 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -252,7 +252,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int op, /* * Issue the synchronous I/O from a different thread - * to avoid generic_make_request recursion. + * to avoid submit_bio_noacct recursion. */ INIT_WORK_ONSTACK(&req.work, do_metadata); queue_work(ps->metadata_wq, &req.work); diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 6b11a266299f..4668b2cd98f4 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -1568,7 +1568,7 @@ static void flush_bios(struct bio *bio) while (bio) { n = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } @@ -1588,7 +1588,7 @@ static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio) bio->bi_next = NULL; r = do_origin(s->origin, bio, false); if (r == DM_MAPIO_REMAPPED) - generic_make_request(bio); + submit_bio_noacct(bio); bio = n; } } @@ -1829,7 +1829,7 @@ static void start_full_bio(struct dm_snap_pending_exception *pe, bio->bi_end_io = full_bio_end_io; bio->bi_private = callback_data; - generic_make_request(bio); + submit_bio_noacct(bio); } static struct dm_snap_pending_exception * diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index fa8d5464c1fb..fe2de2888709 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -758,7 +758,7 @@ static void issue(struct thin_c *tc, struct bio *bio) struct pool *pool = tc->pool; if (!bio_triggers_commit(tc, bio)) { - generic_make_request(bio); + submit_bio_noacct(bio); return; } @@ -2394,7 +2394,7 @@ static void process_deferred_bios(struct pool *pool) if (bio->bi_opf & REQ_PREFLUSH) bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); } } diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index eec9f252e935..75fa4d9b7617 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -681,7 +681,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) verity_submit_prefetch(v, io); - generic_make_request(bio); + submit_bio_noacct(bio); return DM_MAPIO_SUBMITTED; } diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 74f3c506f084..62421554b838 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -1238,7 +1238,7 @@ static int writecache_flush_thread(void *data) bio_end_sector(bio)); wc_unlock(wc); bio_set_dev(bio, wc->dev->bdev); - generic_make_request(bio); + submit_bio_noacct(bio); } else { writecache_flush(wc); wc_unlock(wc); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index a907a9446c0b..05a3cfefe937 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -140,7 +140,7 @@ static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone, bio_advance(bio, clone->bi_iter.bi_size); refcount_inc(&bioctx->ref); - generic_make_request(clone); + submit_bio_noacct(clone); if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone)) zone->wp_block += nr_blocks; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5acfaba3700d..b32b539dbace 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1305,7 +1305,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) if (md->type == DM_TYPE_NVME_BIO_BASED) ret = direct_make_request(clone); else - ret = generic_make_request(clone); + ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: free_tio(tio); @@ -1652,7 +1652,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, error = __split_and_process_non_flush(&ci); if (current->bio_list && ci.sector_count && !error) { /* - * Remainder must be passed to generic_make_request() + * Remainder must be passed to submit_bio_noacct() * so that it gets handled *after* bios already submitted * have been completely processed. * We take a clone of the original to store in @@ -1677,7 +1677,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, bio_chain(b, bio); trace_block_split(md->queue, b, bio->bi_iter.bi_sector); - ret = generic_make_request(bio); + ret = submit_bio_noacct(bio); break; } } @@ -1745,7 +1745,7 @@ static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struc bio_chain(split, *bio); trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector); - generic_make_request(*bio); + submit_bio_noacct(*bio); *bio = split; } } @@ -2500,7 +2500,7 @@ static void dm_wq_work(struct work_struct *work) break; if (dm_request_based(md)) - (void) generic_make_request(c); + (void) submit_bio_noacct(c); else (void) dm_process_bio(md, map, c); } diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index 50ad4ba86f0e..fda4cb3f936f 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -169,7 +169,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) if (bio_data_dir(bio) == WRITE) { /* write request */ if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't generic_make_request, + /* special case - don't decrement, don't submit_bio_noacct, * just fail immediately */ bio_io_error(bio); @@ -214,7 +214,7 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio) } else bio_set_dev(bio, conf->rdev->bdev); - generic_make_request(bio); + submit_bio_noacct(bio); return true; } diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 26c75c0199fa..8efada3ee16f 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -267,7 +267,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) struct bio *split = bio_split(bio, end_sector - bio_sector, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; } @@ -286,7 +286,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); + submit_bio_noacct(bio); } return true; diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 152f9e65a226..277fdfd9ee54 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -131,7 +131,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_private = mp_bh; mddev_check_writesame(mddev, &mp_bh->bio); mddev_check_write_zeroes(mddev, &mp_bh->bio); - generic_make_request(&mp_bh->bio); + submit_bio_noacct(&mp_bh->bio); return true; } @@ -348,7 +348,7 @@ static void multipathd(struct md_thread *thread) bio->bi_opf |= REQ_FAILFAST_TRANSPORT; bio->bi_end_io = multipath_end_request; bio->bi_private = mp_bh; - generic_make_request(bio); + submit_bio_noacct(bio); } } spin_unlock_irqrestore(&conf->device_lock, flags); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 322386ff5d22..e9e91c8d8afc 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -495,7 +495,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; end = zone->zone_end; } else @@ -559,7 +559,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(rdev->bdev), discard_bio, disk_devt(mddev->gendisk), bio->bi_iter.bi_sector); - generic_make_request(discard_bio); + submit_bio_noacct(discard_bio); } bio_endio(bio); } @@ -600,7 +600,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) struct bio *split = bio_split(bio, sectors, GFP_NOIO, &mddev->bio_set); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; } @@ -633,7 +633,7 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); + submit_bio_noacct(bio); return true; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index dcd27f3da84e..2aa2649cca66 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -834,7 +834,7 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; cond_resched(); } @@ -1312,7 +1312,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; @@ -1338,7 +1338,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r1_bio->sector); - generic_make_request(read_bio); + submit_bio_noacct(read_bio); } static void raid1_write_request(struct mddev *mddev, struct bio *bio, @@ -1483,7 +1483,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, bio); - generic_make_request(bio); + submit_bio_noacct(bio); bio = split; r1_bio->master_bio = bio; r1_bio->sectors = max_sectors; @@ -2240,7 +2240,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) atomic_inc(&r1_bio->remaining); md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio)); - generic_make_request(wbio); + submit_bio_noacct(wbio); } put_sync_write_buf(r1_bio, 1); @@ -2926,7 +2926,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; - generic_make_request(bio); + submit_bio_noacct(bio); } } } else { @@ -2935,7 +2935,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr, md_sync_acct_bio(bio, nr_sectors); if (read_targets == 1) bio->bi_opf &= ~MD_FAILFAST; - generic_make_request(bio); + submit_bio_noacct(bio); } return nr_sectors; } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ec136e44aef7..e45fd56cf584 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -917,7 +917,7 @@ static void flush_pending_writes(struct r10conf *conf) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; } blk_finish_plug(&plug); @@ -1102,7 +1102,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) /* Just ignore it */ bio_endio(bio); else - generic_make_request(bio); + submit_bio_noacct(bio); bio = next; } kfree(plug); @@ -1194,7 +1194,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, gfp, &conf->bio_split); bio_chain(split, bio); allow_barrier(conf); - generic_make_request(bio); + submit_bio_noacct(bio); wait_barrier(conf); bio = split; r10_bio->master_bio = bio; @@ -1221,7 +1221,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, disk_devt(mddev->gendisk), r10_bio->sector); - generic_make_request(read_bio); + submit_bio_noacct(read_bio); return; } @@ -1479,7 +1479,7 @@ retry_write: GFP_NOIO, &conf->bio_split); bio_chain(split, bio); allow_barrier(conf); - generic_make_request(bio); + submit_bio_noacct(bio); wait_barrier(conf); bio = split; r10_bio->master_bio = bio; @@ -2099,7 +2099,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) tbio->bi_opf |= MD_FAILFAST; tbio->bi_iter.bi_sector += conf->mirrors[d].rdev->data_offset; bio_set_dev(tbio, conf->mirrors[d].rdev->bdev); - generic_make_request(tbio); + submit_bio_noacct(tbio); } /* Now write out to any replacement devices @@ -2118,7 +2118,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) atomic_inc(&r10_bio->remaining); md_sync_acct(conf->mirrors[d].replacement->bdev, bio_sectors(tbio)); - generic_make_request(tbio); + submit_bio_noacct(tbio); } done: @@ -2241,7 +2241,7 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) wbio = r10_bio->devs[1].bio; wbio2 = r10_bio->devs[1].repl_bio; /* Need to test wbio2->bi_end_io before we call - * generic_make_request as if the former is NULL, + * submit_bio_noacct as if the former is NULL, * the latter is free to free wbio2. */ if (wbio2 && !wbio2->bi_end_io) @@ -2249,13 +2249,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) if (wbio->bi_end_io) { atomic_inc(&conf->mirrors[d].rdev->nr_pending); md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio)); - generic_make_request(wbio); + submit_bio_noacct(wbio); } if (wbio2) { atomic_inc(&conf->mirrors[d].replacement->nr_pending); md_sync_acct(conf->mirrors[d].replacement->bdev, bio_sectors(wbio2)); - generic_make_request(wbio2); + submit_bio_noacct(wbio2); } } @@ -2889,7 +2889,7 @@ static void raid10_set_cluster_sync_high(struct r10conf *conf) * a number of r10_bio structures, one for each out-of-sync device. * As we setup these structures, we collect all bio's together into a list * which we then process collectively to add pages, and then process again - * to pass to generic_make_request. + * to pass to submit_bio_noacct. * * The r10_bio structures are linked using a borrowed master_bio pointer. * This link is counted in ->remaining. When the r10_bio that points to NULL @@ -3496,7 +3496,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, if (bio->bi_end_io == end_sync_read) { md_sync_acct_bio(bio, nr_sectors); bio->bi_status = 0; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -4654,7 +4654,7 @@ read_more: md_sync_acct_bio(read_bio, r10_bio->sectors); atomic_inc(&r10_bio->remaining); read_bio->bi_next = NULL; - generic_make_request(read_bio); + submit_bio_noacct(read_bio); sectors_done += nr_sectors; if (sector_nr <= last) goto read_more; @@ -4717,7 +4717,7 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) md_sync_acct_bio(b, r10_bio->sectors); atomic_inc(&r10_bio->remaining); b->bi_next = NULL; - generic_make_request(b); + submit_bio_noacct(b); } end_reshape_request(r10_bio); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab8067f9ce8c..8dea4398b191 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -873,7 +873,7 @@ static void dispatch_bio_list(struct bio_list *tmp) struct bio *bio; while ((bio = bio_list_pop(tmp))) - generic_make_request(bio); + submit_bio_noacct(bio); } static int cmp_stripe(void *priv, struct list_head *a, struct list_head *b) @@ -1151,7 +1151,7 @@ again: if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else - generic_make_request(bi); + submit_bio_noacct(bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1201,7 +1201,7 @@ again: if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else - generic_make_request(rbi); + submit_bio_noacct(rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -5289,7 +5289,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) trace_block_bio_remap(align_bi->bi_disk->queue, align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); - generic_make_request(align_bi); + submit_bio_noacct(align_bi); return 1; } else { rcu_read_unlock(); @@ -5309,7 +5309,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) struct r5conf *conf = mddev->private; split = bio_split(raid_bio, sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, raid_bio); - generic_make_request(raid_bio); + submit_bio_noacct(raid_bio); raid_bio = split; } diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 89afcf943bf8..f07fa47c251d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -351,7 +351,7 @@ static void nvme_requeue_work(struct work_struct *work) * path. */ bio->bi_disk = head->disk; - generic_make_request(bio); + submit_bio_noacct(bio); } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 083ffc5bc51b..b73cfa6a5141 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -852,7 +852,7 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); -extern blk_qc_t generic_make_request(struct bio *bio); +blk_qc_t submit_bio_noacct(struct bio *bio); extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); -- cgit v1.2.3 From 566acf2daaf2048c692abfc2342ab4a49b59ee40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:45 +0200 Subject: block: refator submit_bio_noacct Split out a __submit_bio_noacct helper for the actual de-recursion algorithm, and simplify the loop by using a continue when we can't enter the queue for a bio. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 143 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 75 insertions(+), 68 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index ff9a88d2d244..57b5dc00d44c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1084,6 +1084,74 @@ static blk_qc_t __submit_bio(struct bio *bio) return ret; } +/* + * The loop in this function may be a bit non-obvious, and so deserves some + * explanation: + * + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure + * that), so we have a list with a single bio. + * - We pretend that we have just taken it off a longer list, so we assign + * bio_list to a pointer to the bio_list_on_stack, thus initialising the + * bio_list of new bios to be added. ->submit_bio() may indeed add some more + * bios through a recursive call to submit_bio_noacct. If it did, we find a + * non-NULL value in bio_list and re-enter the loop from the top. + * - In this case we really did just take the bio of the top of the list (no + * pretending) and so remove it from bio_list, and call into ->submit_bio() + * again. + * + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the current + * ->submit_bio_bio, but that haven't been processed yet. + */ +static blk_qc_t __submit_bio_noacct(struct bio *bio) +{ + struct bio_list bio_list_on_stack[2]; + blk_qc_t ret = BLK_QC_T_NONE; + + BUG_ON(bio->bi_next); + + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; + + do { + struct request_queue *q = bio->bi_disk->queue; + struct bio_list lower, same; + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; + + /* + * Create a fresh bio_list for all subordinate requests. + */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); + + ret = __submit_bio(bio); + + /* + * Sort new bios into those for a lower level and those for the + * same level. + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bio->bi_disk->queue) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + + /* + * Now assemble so we handle the lowest level first. + */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + + current->bio_list = NULL; + return ret; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -1095,82 +1163,21 @@ static blk_qc_t __submit_bio(struct bio *bio) */ blk_qc_t submit_bio_noacct(struct bio *bio) { - /* - * bio_list_on_stack[0] contains bios submitted by the current - * ->submit_bio. - * bio_list_on_stack[1] contains bios that were submitted before the - * current ->submit_bio_bio, but that haven't been processed yet. - */ - struct bio_list bio_list_on_stack[2]; - blk_qc_t ret = BLK_QC_T_NONE; - if (!submit_bio_checks(bio)) - goto out; + return BLK_QC_T_NONE; /* - * We only want one ->submit_bio to be active at a time, else - * stack usage with stacked devices could be a problem. So use - * current->bio_list to keep a list of requests submited by a - * ->submit_bio method. current->bio_list is also used as a - * flag to say if submit_bio_noacct is currently active in this - * task or not. If it is NULL, then no make_request is active. If - * it is non-NULL, then a make_request is active, and new requests - * should be added at the tail + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. */ if (current->bio_list) { bio_list_add(¤t->bio_list[0], bio); - goto out; + return BLK_QC_T_NONE; } - /* following loop may be a bit non-obvious, and so deserves some - * explanation. - * Before entering the loop, bio->bi_next is NULL (as all callers - * ensure that) so we have a list with a single bio. - * We pretend that we have just taken it off a longer list, so - * we assign bio_list to a pointer to the bio_list_on_stack, - * thus initialising the bio_list of new bios to be - * added. ->submit_bio() may indeed add some more bios - * through a recursive call to submit_bio_noacct. If it - * did, we find a non-NULL value in bio_list and re-enter the loop - * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so remove it from - * bio_list, and call into ->submit_bio() again. - */ - BUG_ON(bio->bi_next); - bio_list_init(&bio_list_on_stack[0]); - current->bio_list = bio_list_on_stack; - do { - struct request_queue *q = bio->bi_disk->queue; - - if (likely(bio_queue_enter(bio) == 0)) { - struct bio_list lower, same; - - /* Create a fresh bio_list for all subordinate requests */ - bio_list_on_stack[1] = bio_list_on_stack[0]; - bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); - - /* sort new bios into those for a lower level - * and those for the same level - */ - bio_list_init(&lower); - bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) - bio_list_add(&same, bio); - else - bio_list_add(&lower, bio); - /* now assemble so we handle the lowest level first */ - bio_list_merge(&bio_list_on_stack[0], &lower); - bio_list_merge(&bio_list_on_stack[0], &same); - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); - } - bio = bio_list_pop(&bio_list_on_stack[0]); - } while (bio); - current->bio_list = NULL; /* deactivate */ - -out: - return ret; + return __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); -- cgit v1.2.3 From ff93ea0ce7636345df1bf77a2be09dbb59f4517e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:46 +0200 Subject: block: shortcut __submit_bio_noacct for blk-mq drivers For blk-mq drivers bios can only be inserted for the same queue. So bypass the complicated sorting logic in __submit_bio_noacct with a blk-mq simpler submission helper. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 57b5dc00d44c..2ff166f0d24e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1152,6 +1152,34 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) return ret; } +static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +{ + struct gendisk *disk = bio->bi_disk; + struct bio_list bio_list; + blk_qc_t ret = BLK_QC_T_NONE; + + bio_list_init(&bio_list); + current->bio_list = &bio_list; + + do { + WARN_ON_ONCE(bio->bi_disk != disk); + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; + + if (!blk_crypto_bio_prep(&bio)) { + blk_queue_exit(disk->queue); + ret = BLK_QC_T_NONE; + continue; + } + + ret = blk_mq_submit_bio(bio); + } while ((bio = bio_list_pop(&bio_list))); + + current->bio_list = NULL; + return ret; +} + /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. @@ -1177,6 +1205,8 @@ blk_qc_t submit_bio_noacct(struct bio *bio) return BLK_QC_T_NONE; } + if (!bio->bi_disk->fops->submit_bio) + return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); -- cgit v1.2.3 From 5a6c35f9af416114588298aa7a90b15bbed15a41 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 10:59:47 +0200 Subject: block: remove direct_make_request Now that submit_bio_noacct has a decent blk-mq fast path there is no more need for this bypass. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 28 ---------------------------- drivers/md/dm.c | 5 +---- drivers/nvme/host/multipath.c | 2 +- include/linux/blkdev.h | 1 - 4 files changed, 2 insertions(+), 34 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 2ff166f0d24e..bf882b8d8445 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1211,34 +1211,6 @@ blk_qc_t submit_bio_noacct(struct bio *bio) } EXPORT_SYMBOL(submit_bio_noacct); -/** - * direct_make_request - hand a buffer directly to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * This function behaves like submit_bio_noacct(), but does not protect - * against recursion. Must only be used if the called driver is known - * to be blk-mq based. - */ -blk_qc_t direct_make_request(struct bio *bio) -{ - struct gendisk *disk = bio->bi_disk; - - if (WARN_ON_ONCE(!disk->queue->mq_ops)) { - bio_io_error(bio); - return BLK_QC_T_NONE; - } - if (!submit_bio_checks(bio)) - return BLK_QC_T_NONE; - if (unlikely(bio_queue_enter(bio))) - return BLK_QC_T_NONE; - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(disk->queue); - return BLK_QC_T_NONE; - } - return blk_mq_submit_bio(bio); -} -EXPORT_SYMBOL_GPL(direct_make_request); - /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b32b539dbace..2cb33896198c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1302,10 +1302,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) /* the bio has been remapped so dispatch it */ trace_block_bio_remap(clone->bi_disk->queue, clone, bio_dev(io->orig_bio), sector); - if (md->type == DM_TYPE_NVME_BIO_BASED) - ret = direct_make_request(clone); - else - ret = submit_bio_noacct(clone); + ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: free_tio(tio); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index f07fa47c251d..a986ac52c4cc 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -314,7 +314,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) trace_block_bio_remap(bio->bi_disk->queue, bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); - ret = direct_make_request(bio); + ret = submit_bio_noacct(bio); } else if (nvme_available_path(head)) { dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n"); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b73cfa6a5141..1cc913ffdbe2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -853,7 +853,6 @@ static inline void rq_flush_dcache_pages(struct request *rq) extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); blk_qc_t submit_bio_noacct(struct bio *bio); -extern blk_qc_t direct_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern struct request *blk_get_request(struct request_queue *, unsigned int op, -- cgit v1.2.3 From 6e2fa4dd683a22a7697e7ff51dad499406094d28 Mon Sep 17 00:00:00 2001 From: Hongnan Li Date: Wed, 1 Jul 2020 16:09:38 +0800 Subject: blk-iolatency: only call ktime_get() if needed ktime_to_ns(ktime_get()), which is expensive, does not need to be called if blk_iolatency_enabled() return false in blkcg_iolatency_done_bio(). Postponing ktime_to_ns(ktime_get()) execution reduces the CPU usage when blk_iolatency is disabled. Signed-off-by: Hongnan Li Signed-off-by: Jens Axboe --- block/blk-iolatency.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c128d50cb410..f90429cf4edf 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -591,7 +591,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; - u64 now = ktime_to_ns(ktime_get()); + u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); bool enabled = false; int inflight = 0; @@ -608,6 +608,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!enabled) return; + now = ktime_to_ns(ktime_get()); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { -- cgit v1.2.3 From 4e2f62e566b5bddec00682f36a404e05a3b64b6a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 1 Jul 2020 22:58:32 -0600 Subject: Revert "blk-mq: put driver tag when this request is completed" This reverts commits the following commits: 37f4a24c2469a10a4c16c641671bd766e276cf9f 723bf178f158abd1ce6069cb049581b3cb003aab 36a3df5a4574d5ddf59804fcd0c4e9654c514d9a The last one is the culprit, but we have to go a bit deeper to get this to revert cleanly. There's been a report that this breaks some MMC setups [1], and also causes an issue with swap [2]. Until this can be figured out, revert the offending commits. [1] https://lore.kernel.org/linux-block/57fb09b1-54ba-f3aa-f82c-d709b0e6b281@samsung.com/ [2] https://lore.kernel.org/linux-block/20200702043721.GA1087@lca.pw/ Reported-by: Marek Szyprowski Reported-by: Qian Cai Signed-off-by: Jens Axboe --- block/blk-flush.c | 23 +++++++++++++++++------ block/blk-mq-tag.h | 12 ++++++++++++ block/blk-mq.c | 52 ++++++++++++++++------------------------------------ block/blk-mq.h | 20 ++++++++++++++++++++ block/blk.h | 5 +++++ 5 files changed, 70 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index e756db088d84..15ae0155ec07 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -236,10 +236,13 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) error = fq->rq_status; hctx = flush_rq->mq_hctx; - if (!q->elevator) - flush_rq->tag = BLK_MQ_NO_TAG; - else - flush_rq->internal_tag = BLK_MQ_NO_TAG; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag(flush_rq); + flush_rq->internal_tag = -1; + } running = &fq->flush_queue[fq->flush_running_idx]; BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); @@ -313,10 +316,13 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) + if (!q->elevator) { + fq->orig_rq = first_rq; flush_rq->tag = first_rq->tag; - else + blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); + } else { flush_rq->internal_tag = first_rq->internal_tag; + } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); @@ -335,6 +341,11 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag(rq); + } + /* * After populating an empty queue, kick it to avoid stall. Read * the comment in flush_end_io(). diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index b1acac518c4e..3945c7f5b944 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -101,6 +101,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } +/* + * This helper should only be used for flush request to share tag + * with the request cloned from, and both the two requests can't be + * in flight at the same time. The caller has to make sure the tag + * can't be freed. + */ +static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, + unsigned int tag, struct request *rq) +{ + hctx->tags->rqs[tag] = rq; +} + static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 948987e9b6ab..abcf590f6238 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -277,20 +277,26 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; + req_flags_t rq_flags = 0; if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { + if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { + rq_flags = RQF_MQ_INFLIGHT; + atomic_inc(&data->hctx->nr_active); + } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; + data->hctx->tags->rqs[rq->tag] = rq; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; - rq->rq_flags = 0; + rq->rq_flags = rq_flags; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -660,32 +666,10 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } -static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, - struct request *rq) -{ - blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); - rq->tag = BLK_MQ_NO_TAG; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } -} - -static inline void blk_mq_put_driver_tag(struct request *rq) -{ - if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) - return; - - __blk_mq_put_driver_tag(rq->mq_hctx, rq); -} - bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); - blk_mq_put_driver_tag(rq); - /* * For a polled request, always complete locallly, it's pointless * to redirect the completion. @@ -1121,10 +1105,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + bool shared = blk_mq_tag_busy(rq->mq_hctx); int tag; - blk_mq_tag_busy(rq->mq_hctx); - if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; @@ -1137,22 +1120,19 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; + if (shared) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&rq->mq_hctx->nr_active); + } + rq->mq_hctx->tags->rqs[rq->tag] = rq; return true; } static bool blk_mq_get_driver_tag(struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) - return false; - - if (hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&hctx->nr_active); - } - hctx->tags->rqs[rq->tag] = rq; - return true; + if (rq->tag != BLK_MQ_NO_TAG) + return true; + return __blk_mq_get_driver_tag(rq); } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, diff --git a/block/blk-mq.h b/block/blk-mq.h index 3661e821d460..863a2f3346d4 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -193,6 +193,26 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) return true; } +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = BLK_MQ_NO_TAG; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + atomic_dec(&hctx->nr_active); + } +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) + return; + + __blk_mq_put_driver_tag(rq->mq_hctx, rq); +} + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; diff --git a/block/blk.h b/block/blk.h index 9dcf51c94096..94f7c084f68f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,6 +25,11 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; + /* + * flush_rq shares tag with this rq, both can't be active + * at the same time + */ + struct request *orig_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; -- cgit v1.2.3 From 7c792f33c1b3e231cfd50fea589194c593a6ce3c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Jul 2020 21:21:25 +0200 Subject: block: initialize current->bio_list[1] in __submit_bio_noacct_mq bio_alloc_bioset references current->bio_list[1], so we need to initialize it for the blk-mq submission path as well. Fixes: ff93ea0ce763 ("block: shortcut __submit_bio_noacct for blk-mq drivers") Reported-by: Qian Cai Reported-by: Naresh Kamboju Signed-off-by: Christoph Hellwig Tested-by: Naresh Kamboju Signed-off-by: Jens Axboe --- block/blk-core.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index bf882b8d8445..9f1bf8658b61 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1155,11 +1155,10 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct gendisk *disk = bio->bi_disk; - struct bio_list bio_list; + struct bio_list bio_list[2] = { }; blk_qc_t ret = BLK_QC_T_NONE; - bio_list_init(&bio_list); - current->bio_list = &bio_list; + current->bio_list = bio_list; do { WARN_ON_ONCE(bio->bi_disk != disk); @@ -1174,7 +1173,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) } ret = blk_mq_submit_bio(bio); - } while ((bio = bio_list_pop(&bio_list))); + } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; return ret; -- cgit v1.2.3 From 0e6e255e7a58cdf4ee4163f83deeb5ce4946051e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Jul 2020 19:45:03 +0200 Subject: block: remove a bogus warning in __submit_bio_noacct_mq If blk_mq_submit_bio flushes the plug list, bios for other disks can show up on current->bio_list. As that doesn't involve any stacking of block device it is entirely harmless and we should not warn about this case. Fixes: ff93ea0ce763 ("block: shortcut __submit_bio_noacct for blk-mq drivers") Reported-by: kernel test robot Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9f1bf8658b61..93104c7470e8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1154,14 +1154,13 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { - struct gendisk *disk = bio->bi_disk; struct bio_list bio_list[2] = { }; blk_qc_t ret = BLK_QC_T_NONE; current->bio_list = bio_list; do { - WARN_ON_ONCE(bio->bi_disk != disk); + struct gendisk *disk = bio->bi_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; -- cgit v1.2.3 From 7bf137298cb73afdb8c5536e474f7ba71ab1c235 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 1 Jul 2020 21:58:57 +0800 Subject: blk-mq: streamline handling of q->mq_ops->queue_rq result Current handling of q->mq_ops->queue_rq result is a bit ugly: - two branches which needs to 'continue' have to check if the dispatch local list is empty, otherwise one bad request may be retrieved via 'rq = list_first_entry(list, struct request, queuelist);' - the branch of 'if (unlikely(ret != BLK_STS_OK))' isn't easy to follow, since it is actually one error branch. Streamline this handling, so the code becomes more readable, meantime potential kernel oops can be avoided in case that the last request in local dispatch list is failed. Fixes: fc17b6534eb8 ("blk-mq: switch ->queue_rq return value to blk_status_t") Signed-off-by: Ming Lei Reviewed-by: Johannes Thumshirn Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-mq.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index abcf590f6238..89c83fa97ba0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1387,30 +1387,28 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (nr_budgets) nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_handle_dev_resource(rq, list); + switch (ret) { + case BLK_STS_OK: + queued++; break; - } else if (ret == BLK_STS_ZONE_RESOURCE) { + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: /* * Move the request to zone_list and keep going through * the dispatch list to find more requests the drive can * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); - if (list_empty(list)) - break; - continue; - } - - if (unlikely(ret != BLK_STS_OK)) { + break; + default: errors++; blk_mq_end_request(rq, BLK_STS_IOERR); - continue; } - - queued++; } while (!list_empty(list)); - +out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); -- cgit v1.2.3 From 568f2700657794b8258e72983ba508793a658942 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 6 Jul 2020 22:41:11 +0800 Subject: blk-mq: centralise related handling into blk_mq_get_driver_tag Move .nr_active update and request assignment into blk_mq_get_driver_tag(), all are good to do during getting driver tag. Meantime blk-flush related code is simplified and flush request needn't to update the request table manually any more. Signed-off-by: Ming Lei Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-flush.c | 14 ++++---------- block/blk-mq-tag.h | 12 ------------ block/blk-mq.c | 31 +++++++++++++++---------------- block/blk.h | 5 ----- 4 files changed, 19 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 15ae0155ec07..71c7f520e040 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -219,7 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - struct blk_mq_hw_ctx *hctx; blk_account_io_flush(flush_rq); @@ -235,13 +234,11 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) if (fq->rq_status != BLK_STS_OK) error = fq->rq_status; - hctx = flush_rq->mq_hctx; if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); - flush_rq->internal_tag = -1; + flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -316,13 +313,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { - fq->orig_rq = first_rq; + if (!q->elevator) flush_rq->tag = first_rq->tag; - blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); - } else { + else flush_rq->internal_tag = first_rq->internal_tag; - } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 3945c7f5b944..b1acac518c4e 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -101,18 +101,6 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return atomic_read(&hctx->nr_active) < depth; } -/* - * This helper should only be used for flush request to share tag - * with the request cloned from, and both the two requests can't be - * in flight at the same time. The caller has to make sure the tag - * can't be freed. - */ -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag, struct request *rq) -{ - hctx->tags->rqs[tag] = rq; -} - static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, unsigned int tag) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 89c83fa97ba0..b6dd080d3962 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -277,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; if (data->q->elevator) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; - data->hctx->tags->rqs[rq->tag] = rq; } /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; rq->mq_hctx = data->hctx; - rq->rq_flags = rq_flags; + rq->rq_flags = 0; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -1105,9 +1099,10 @@ static bool __blk_mq_get_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); int tag; + blk_mq_tag_busy(rq->mq_hctx); + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; @@ -1120,19 +1115,23 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; return true; } static bool blk_mq_get_driver_tag(struct request *rq) { - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && + !(rq->rq_flags & RQF_MQ_INFLIGHT)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + atomic_inc(&hctx->nr_active); + } + hctx->tags->rqs[rq->tag] = rq; + return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, diff --git a/block/blk.h b/block/blk.h index 94f7c084f68f..9dcf51c94096 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,11 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - /* - * flush_rq shares tag with this rq, both can't be active - * at the same time - */ - struct request *orig_rq; struct lock_class_key key; spinlock_t mq_flush_lock; }; -- cgit v1.2.3 From a564e23f0f99759f453dbefcb9160dec6d99df96 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Jul 2020 14:25:41 +0200 Subject: md: switch to ->check_events for media change notifications md is the last driver using the legacy media_changed method. Switch it over to (not so) new ->clear_events approach, which also removes the need for the ->revalidate_disk method. Signed-off-by: Christoph Hellwig [axboe: remove unused 'bdops' variable in disk_clear_events()] Signed-off-by: Jens Axboe --- Documentation/filesystems/locking.rst | 4 +--- block/genhd.c | 8 +------- drivers/md/md.c | 19 ++++++++----------- include/linux/blkdev.h | 2 -- 4 files changed, 10 insertions(+), 23 deletions(-) (limited to 'block') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 318605de83f3..17bea12538c3 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -467,7 +467,6 @@ prototypes:: int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *); - int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); @@ -483,14 +482,13 @@ release: yes ioctl: no compat_ioctl: no direct_access: no -media_changed: no unlock_native_capacity: no revalidate_disk: no getgeo: no swap_slot_free_notify: no (see below) ======================= =================== -media_changed, unlock_native_capacity and revalidate_disk are called only from +unlock_native_capacity and revalidate_disk are called only from check_disk_change(). swap_slot_free_notify is called with swap_lock and sometimes the page lock diff --git a/block/genhd.c b/block/genhd.c index 60ae4e1b4d38..c42a49f2f537 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -2056,18 +2056,12 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) */ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) { - const struct block_device_operations *bdops = disk->fops; struct disk_events *ev = disk->ev; unsigned int pending; unsigned int clearing = mask; - if (!ev) { - /* for drivers still using the old ->media_changed method */ - if ((mask & DISK_EVENT_MEDIA_CHANGE) && - bdops->media_changed && bdops->media_changed(disk)) - return DISK_EVENT_MEDIA_CHANGE; + if (!ev) return 0; - } disk_block_events(disk); diff --git a/drivers/md/md.c b/drivers/md/md.c index 8bb69c61afe0..77dfe4765c31 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5670,6 +5670,7 @@ static int md_alloc(dev_t dev, char *name) * remove it now. */ disk->flags |= GENHD_FL_EXT_DEVT; + disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; /* As soon as we call add_disk(), another thread could get * through to md_open, so make sure it doesn't get too far @@ -7806,20 +7807,17 @@ static void md_release(struct gendisk *disk, fmode_t mode) mddev_put(mddev); } -static int md_media_changed(struct gendisk *disk) -{ - struct mddev *mddev = disk->private_data; - - return mddev->changed; -} - -static int md_revalidate(struct gendisk *disk) +static unsigned int md_check_events(struct gendisk *disk, unsigned int clearing) { struct mddev *mddev = disk->private_data; + unsigned int ret = 0; + if (mddev->changed) + ret = DISK_EVENT_MEDIA_CHANGE; mddev->changed = 0; - return 0; + return ret; } + static const struct block_device_operations md_fops = { .owner = THIS_MODULE, @@ -7831,8 +7829,7 @@ static const struct block_device_operations md_fops = .compat_ioctl = md_compat_ioctl, #endif .getgeo = md_getgeo, - .media_changed = md_media_changed, - .revalidate_disk= md_revalidate, + .check_events = md_check_events, }; static int md_thread(void *arg) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 408eb66a82fd..71173a1ffa8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1781,8 +1781,6 @@ struct block_device_operations { int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); - /* ->media_changed() is DEPRECATED, use ->check_events() instead */ - int (*media_changed) (struct gendisk *); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); -- cgit v1.2.3 From 8c911f3d4c074a17955a1757c9d1d5a9a5209ca5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 1 Jul 2020 11:06:21 +0200 Subject: writeback: remove struct bdi_writeback_congested We never set any congested bits in the group writeback instances of it. And for the simpler bdi-wide case a simple scalar field is all that that is needed. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 19 +---- drivers/md/dm.c | 2 +- include/linux/backing-dev-defs.h | 25 +------ include/linux/backing-dev.h | 18 +---- include/linux/blk-cgroup.h | 6 -- mm/backing-dev.c | 149 +++------------------------------------ 6 files changed, 14 insertions(+), 205 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 594f1d0b0e5a..e00d0458a9d6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -95,9 +95,6 @@ static void __blkg_release(struct rcu_head *rcu) css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - blkg_free(blkg); } @@ -227,7 +224,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -245,31 +241,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { - ret = -ENOMEM; - goto err_put_css; - } - /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_congested; + goto err_put_css; } } blkg = new_blkg; - blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put_css; } blkg_get(blkg->parent); } @@ -306,8 +293,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 2d368cafb23e..48bfd41658aa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1834,7 +1834,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * top-level queue for congestion. */ struct backing_dev_info *bdi = md->queue->backing_dev_info; - r = bdi->wb.congested->state & bdi_bits; + r = bdi->wb.congested & bdi_bits; } else { map = dm_get_live_table_fast(md); if (map) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index cc5aa1f32b91..1cec4521e1fb 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -87,26 +87,6 @@ struct wb_completion { #define DEFINE_WB_COMPLETION(cmpl, bdi) \ struct wb_completion cmpl = WB_COMPLETION_INIT(bdi) -/* - * For cgroup writeback, multiple wb's may map to the same blkcg. Those - * wb's can operate mostly independently but should share the congested - * state. To facilitate such sharing, the congested state is tracked using - * the following struct which is created on demand, indexed by blkcg ID on - * its bdi, and refcounted. - */ -struct bdi_writeback_congested { - unsigned long state; /* WB_[a]sync_congested flags */ - refcount_t refcnt; /* nr of attached wb's and blkg */ - -#ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *__bdi; /* the associated bdi, set to NULL - * on bdi unregistration. For memcg-wb - * internal use only! */ - int blkcg_id; /* ID of the associated blkcg */ - struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ -#endif -}; - /* * Each wb (bdi_writeback) can perform writeback operations, is measured * and throttled, independently. Without cgroup writeback, each bdi @@ -140,7 +120,7 @@ struct bdi_writeback { struct percpu_counter stat[NR_WB_STAT_ITEMS]; - struct bdi_writeback_congested *congested; + unsigned long congested; /* WB_[a]sync_congested flags */ unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; @@ -208,11 +188,8 @@ struct backing_dev_info { struct list_head wb_list; /* list of all wbs */ #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ - struct rb_root cgwb_congested_tree; /* their congested states */ struct mutex cgwb_release_mutex; /* protect shutdown of wb structs */ struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */ -#else - struct bdi_writeback_congested *wb_congested; #endif wait_queue_head_t wb_waitq; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6b3504bf7a42..9173d2c22b4a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -173,7 +173,7 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, cong_bits); - return wb->congested->state & cong_bits; + return wb->congested & cong_bits; } long congestion_wait(int sync, long timeout); @@ -224,9 +224,6 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK -struct bdi_writeback_congested * -wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); -void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, @@ -404,19 +401,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } -static inline struct bdi_writeback_congested * -wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) -{ - refcount_inc(&bdi->wb_congested->refcnt); - return bdi->wb_congested; -} - -static inline void wb_congested_put(struct bdi_writeback_congested *congested) -{ - if (refcount_dec_and_test(&congested->refcnt)) - kfree(congested); -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 431b2d18bf40..c8fc9792ac77 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -109,12 +109,6 @@ struct blkcg_gq { struct hlist_node blkcg_node; struct blkcg *blkcg; - /* - * Each blkg gets congested separately and the congestion state is - * propagated to the matching bdi_writeback_congested. - */ - struct bdi_writeback_congested *wb_congested; - /* all non-root blkcg_gq's are guaranteed to have access to parent */ struct blkcg_gq *parent; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3ebe5144a102..8e8b00627bb2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -281,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, - int blkcg_id, gfp_t gfp) + gfp_t gfp) { int i, err; @@ -308,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->dirty_sleep = jiffies; - wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) { - err = -ENOMEM; - goto out_put_bdi; - } - err = fprop_local_init_percpu(&wb->completions, gfp); if (err) - goto out_put_cong; + goto out_put_bdi; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); @@ -330,8 +324,6 @@ out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); -out_put_cong: - wb_congested_put(wb->congested); out_put_bdi: if (wb != &bdi->wb) bdi_put(bdi); @@ -374,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); - wb_congested_put(wb->congested); if (wb != &wb->bdi->wb) bdi_put(wb->bdi); } @@ -384,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb) #include /* - * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, - * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. + * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list. + * bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; -/** - * wb_congested_get_create - get or create a wb_congested - * @bdi: associated bdi - * @blkcg_id: ID of the associated blkcg - * @gfp: allocation mask - * - * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. - * The returned wb_congested has its reference count incremented. Returns - * NULL on failure. - */ -struct bdi_writeback_congested * -wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) -{ - struct bdi_writeback_congested *new_congested = NULL, *congested; - struct rb_node **node, *parent; - unsigned long flags; -retry: - spin_lock_irqsave(&cgwb_lock, flags); - - node = &bdi->cgwb_congested_tree.rb_node; - parent = NULL; - - while (*node != NULL) { - parent = *node; - congested = rb_entry(parent, struct bdi_writeback_congested, - rb_node); - if (congested->blkcg_id < blkcg_id) - node = &parent->rb_left; - else if (congested->blkcg_id > blkcg_id) - node = &parent->rb_right; - else - goto found; - } - - if (new_congested) { - /* !found and storage for new one already allocated, insert */ - congested = new_congested; - rb_link_node(&congested->rb_node, parent, node); - rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - spin_unlock_irqrestore(&cgwb_lock, flags); - return congested; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - - /* allocate storage for new one and retry */ - new_congested = kzalloc(sizeof(*new_congested), gfp); - if (!new_congested) - return NULL; - - refcount_set(&new_congested->refcnt, 1); - new_congested->__bdi = bdi; - new_congested->blkcg_id = blkcg_id; - goto retry; - -found: - refcount_inc(&congested->refcnt); - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(new_congested); - return congested; -} - -/** - * wb_congested_put - put a wb_congested - * @congested: wb_congested to put - * - * Put @congested and destroy it if the refcnt reaches zero. - */ -void wb_congested_put(struct bdi_writeback_congested *congested) -{ - unsigned long flags; - - if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) - return; - - /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->__bdi) { - rb_erase(&congested->rb_node, - &congested->__bdi->cgwb_congested_tree); - congested->__bdi = NULL; - } - - spin_unlock_irqrestore(&cgwb_lock, flags); - kfree(congested); -} - static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, @@ -558,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi, goto out_put; } - ret = wb_init(wb, bdi, blkcg_css->id, gfp); + ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; @@ -696,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); - bdi->cgwb_congested_tree = RB_ROOT; mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); - ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); + ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; @@ -769,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - struct rb_node *rbn; - - spin_lock_irq(&cgwb_lock); - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->__bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); @@ -810,29 +698,11 @@ subsys_initcall(cgwb_init); static int cgwb_bdi_init(struct backing_dev_info *bdi) { - int err; - - bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL); - if (!bdi->wb_congested) - return -ENOMEM; - - refcount_set(&bdi->wb_congested->refcnt, 1); - - err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); - if (err) { - wb_congested_put(bdi->wb_congested); - return err; - } - return 0; + return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } -static void cgwb_bdi_exit(struct backing_dev_info *bdi) -{ - wb_congested_put(bdi->wb_congested); -} - static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); @@ -1023,7 +893,6 @@ static void release_bdi(struct kref *ref) bdi_unregister(bdi); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); - cgwb_bdi_exit(bdi); kfree(bdi); } @@ -1053,7 +922,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.congested->state)) + if (test_and_clear_bit(bit, &bdi->wb.congested)) atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) @@ -1066,7 +935,7 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.congested->state)) + if (!test_and_set_bit(bit, &bdi->wb.congested)) atomic_inc(&nr_wb_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); -- cgit v1.2.3 From 106e71c51287b9bdf9d400cac657a9af7f0f1e3d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 4 Jul 2020 15:26:14 +0800 Subject: blk-mq: Remove unnecessary local variable Remove unnecessary local variable 'ret' in blk_mq_dispatch_hctx_list(). Signed-off-by: Baolin Wang Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 1c52e56a19b1..b8db72cf1043 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -96,7 +96,6 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) struct request *rq; LIST_HEAD(hctx_list); unsigned int count = 0; - bool ret; list_for_each_entry(rq, rq_list, queuelist) { if (rq->mq_hctx != hctx) { @@ -108,8 +107,7 @@ static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) list_splice_tail_init(rq_list, &hctx_list); dispatch: - ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count); - return ret; + return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ -- cgit v1.2.3 From 87890092ee6504a49841d854ff81a71a85c19a5d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 4 Jul 2020 15:28:21 +0800 Subject: blk-mq: remove redundant validation in __blk_mq_end_request() We've already validated the 'q->elevator' before calling ->ops.completed_request() in blk_mq_sched_completed_request(), thus no need to validate rq->internal_tag again. Rmove it. Signed-off-by: Baolin Wang Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index b6dd080d3962..c3856377b961 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -544,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) blk_stat_add(rq, now); } - if (rq->internal_tag != BLK_MQ_NO_TAG) - blk_mq_sched_completed_request(rq, now); + blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); -- cgit v1.2.3 From 9054650fac24b784df8500aba2869ebf240d069a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 13 Jul 2020 15:48:16 -0600 Subject: block: relax jiffies rounding for timeouts In doing high IOPS testing, blk-mq is generally pretty well optimized. There are a few things that stuck out as using more CPU than what is really warranted, and one thing is the round_jiffies_up() that we do twice for each request. That accounts for about 0.8% of the CPU in my testing. We can make this cheaper by avoiding an integer division, by just adding a rough HZ mask that we can AND with instead. The timeouts are only on a second granularity already, we don't have to be that accurate here and this patch barely changes that. All we care about is nice grouping. Signed-off-by: Jens Axboe --- block/blk-timeout.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3a1ac6434758..8ab8a82825cd 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -88,11 +88,29 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +static unsigned long blk_timeout_mask __read_mostly; + +int __init blk_timeout_init(void) +{ + blk_timeout_mask = roundup_pow_of_two(HZ) - 1; + return 0; +} + +late_initcall(blk_timeout_init); + +/* + * Just a rough estimate, we don't care about specific values for timeouts. + */ +static inline unsigned long blk_round_jiffies(unsigned long j) +{ + return (j + blk_timeout_mask) + 1; +} + unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; - maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; @@ -129,7 +147,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(expiry)); + expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { -- cgit v1.2.3 From d0f0f1b4c55e9e1b10e47399f544be52567d9fc3 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 15 Jul 2020 16:36:19 +0800 Subject: block: always remove partitions from blk_drop_partitions() In theory, when GENHD_FL_NO_PART_SCAN is set, no partitions can be created on one disk. However, ioctl(BLKPG, BLKPG_ADD_PARTITION) doesn't check GENHD_FL_NO_PART_SCAN, so partitions still can be added even though GENHD_FL_NO_PART_SCAN is set. So far blk_drop_partitions() only removes partitions when disk_part_scan_enabled() return true. This way can make ghost partition on loop device after changing/clearing FD in case that PARTSCAN is disabled, such as partitions can be added via 'parted' on loop disk even though GENHD_FL_NO_PART_SCAN is set. Fix this issue by always removing partitions in blk_drop_partitions(), and this way is correct because the current code supposes that no partitions can be added in case of GENHD_FL_NO_PART_SCAN. Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- block/partitions/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/partitions/core.c b/block/partitions/core.c index 78951e33b2d7..e62a98a8eeb7 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -619,8 +619,6 @@ int blk_drop_partitions(struct block_device *bdev) struct disk_part_iter piter; struct hd_struct *part; - if (!disk_part_scan_enabled(bdev->bd_disk)) - return 0; if (bdev->bd_part_count) return -EBUSY; -- cgit v1.2.3 From e791ee6885f7e9e32c3e551cadea8bc6effaae1e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 15 Jul 2020 09:33:37 -0600 Subject: Revert "blk-rq-qos: remove redundant finish_wait to rq_qos_wait." This reverts commit 826f2f48da8c331ac51e1381998d318012d66550. Qian Cai reports that this commit causes stalls with swap. Revert until the reason can be figured out. Reported-by: Qian Cai Signed-off-by: Jens Axboe --- block/blk-rq-qos.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block') diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 18f3eab9f768..656460636ad3 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -273,6 +273,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token -- cgit v1.2.3 From a43f085f8720c4f705b86e7432d19ef955750b36 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Fri, 19 Jun 2020 17:23:17 +0206 Subject: block: remove unnecessary ioc nested locking The legacy CFQ IO scheduler could call put_io_context() in its exit_icq() elevator callback. This led to a lockdep warning, which was fixed in commit d8c66c5d5924 ("block: fix lockdep warning on io_context release put_io_context()") by using a nested subclass for the ioc spinlock. However, with commit f382fb0bcef4 ("block: remove legacy IO schedulers") the CFQ IO scheduler no longer exists. The BFQ IO scheduler also implements the exit_icq() elevator callback but does not call put_io_context(). The nested subclass for the ioc spinlock is no longer needed. Since it existed as an exception and no longer applies, remove the nested subclass usage. Signed-off-by: John Ogness Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-ioc.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9df50fb507ca..5dbcfa1b872e 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -96,15 +96,7 @@ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); - unsigned long flags; - - /* - * Exiting icq may call into put_io_context() through elevator - * which will trigger lockdep warning. The ioc's are guaranteed to - * be different, use a different locking subclass here. Use - * irqsave variant as there's no spin_lock_irq_nested(). - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -115,13 +107,13 @@ static void ioc_release_fn(struct work_struct *work) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); cpu_relax(); - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); } } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } @@ -170,7 +162,6 @@ void put_io_context(struct io_context *ioc) */ void put_io_context_active(struct io_context *ioc) { - unsigned long flags; struct io_cq *icq; if (!atomic_dec_and_test(&ioc->active_ref)) { @@ -178,19 +169,14 @@ void put_io_context_active(struct io_context *ioc) return; } - /* - * Need ioc lock to walk icq_list and q lock to exit icq. Perform - * reverse double locking. Read comment in ioc_release_fn() for - * explanation on the nested locking annotation. - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; ioc_exit_icq(icq); } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); put_io_context(ioc); } -- cgit v1.2.3 From ab96bbab467c884ad684c5f669c91272a0455087 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Fri, 19 Jun 2020 17:23:18 +0206 Subject: block: remove retry loop in ioc_release_fn() The reverse-order double lock dance in ioc_release_fn() is using a retry loop. This is a problem on PREEMPT_RT because it could preempt the task that would release q->queue_lock and thus live lock in the retry loop. RCU is already managing the freeing of the request queue and icq. If the trylock fails, use RCU to guarantee that the request queue and icq are not freed and re-acquire the locks in the correct order, allowing forward progress. Signed-off-by: John Ogness Reviewed-by: Daniel Wagner Signed-off-by: Jens Axboe --- block/blk-ioc.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 5dbcfa1b872e..57299f860d41 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -107,9 +107,23 @@ static void ioc_release_fn(struct work_struct *work) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { - spin_unlock_irq(&ioc->lock); - cpu_relax(); - spin_lock_irq(&ioc->lock); + /* Make sure q and icq cannot be freed. */ + rcu_read_lock(); + + /* Re-acquire the locks in the correct order. */ + spin_unlock(&ioc->lock); + spin_lock(&q->queue_lock); + spin_lock(&ioc->lock); + + /* + * The icq may have been destroyed when the ioc lock + * was released. + */ + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + + spin_unlock(&q->queue_lock); + rcu_read_unlock(); } } -- cgit v1.2.3 From 943c4d9074aa05b86af89eec25330c1995e807c9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 17 Jul 2020 18:00:02 +0800 Subject: block: make blk_timeout_init() static The sparse tool complains as follows: block/blk-timeout.c:93:12: warning: symbol 'blk_timeout_init' was not declared. Should it be static? Function blk_timeout_init() is not used outside of blk-timeout.c, so mark it static. Fixes: 9054650fac24 ("block: relax jiffies rounding for timeouts") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- block/blk-timeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8ab8a82825cd..4c1fc3417460 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL_GPL(blk_abort_request); static unsigned long blk_timeout_mask __read_mostly; -int __init blk_timeout_init(void) +static int __init blk_timeout_init(void) { blk_timeout_mask = roundup_pow_of_two(HZ) - 1; return 0; -- cgit v1.2.3 From b5718d6c00afe51ce5327cf2beae4019da0c8e74 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Thu, 16 Jul 2020 02:52:01 -0400 Subject: block: defer flush request no matter whether we have elevator Commit 7520872c0cf4 ("block: don't defer flushes on blk-mq + scheduling") tried to fix deadlock for cycled wait between flush requests and data request into flush_data_in_flight. The former holded all driver tags and wait for data request completion, but the latter can not complete for waiting free driver tags. After commit 923218f6166a ("blk-mq: don't allocate driver tag upfront for flush rq"), flush requests will not get driver tag before queuing into flush queue. * With elevator, flush request just get sched_tags before inserting flush queue. It will not get driver tag until issue them to driver. data request on list fq->flush_data_in_flight will complete in the end. * Without elevator, each flush request will get a driver tag when allocate request. Then data request on fq->flush_data_in_flight don't worry about lacking driver tag. In both of these cases, cycled wait cannot be true. So we may allow to defer flush request. Signed-off-by: Yufen Yu Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-flush.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 71c7f520e040..6e1543c10493 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -283,13 +283,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; - /* C2 and C3 - * - * For blk-mq + scheduling, we can risk having all driver tags - * assigned to empty flushes, and we deadlock if we are expecting - * other requests to make progress. Don't defer for that case. - */ - if (!list_empty(&fq->flush_data_in_flight) && q->elevator && + /* C2 and C3 */ + if (!list_empty(&fq->flush_data_in_flight) && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; -- cgit v1.2.3 From 9b15d109a6b2c0c604c588d49a5a927dac6dd616 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 17 Jul 2020 10:42:30 +0800 Subject: block: improve discard bio alignment in __blkdev_issue_discard() This patch improves discard bio split for address and size alignment in __blkdev_issue_discard(). The aligned discard bio may help underlying device controller to perform better discard and internal garbage collection, and avoid unnecessary internal fragment. Current discard bio split algorithm in __blkdev_issue_discard() may have non-discarded fregment on device even the discard bio LBA and size are both aligned to device's discard granularity size. Here is the example steps on how to reproduce the above problem. - On a VMWare ESXi 6.5 update3 installation, create a 51GB virtual disk with thin mode and give it to a Linux virtual machine. - Inside the Linux virtual machine, if the 50GB virtual disk shows up as /dev/sdb, fill data into the first 50GB by, # dd if=/dev/zero of=/dev/sdb bs=4096 count=13107200 - Discard the 50GB range from offset 0 on /dev/sdb, # blkdiscard /dev/sdb -o 0 -l 53687091200 - Observe the underlying mapping status of the device # sg_get_lba_status /dev/sdb -m 1048 --lba=0 descriptor LBA: 0x0000000000000000 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000000000800 blocks: 16773120 deallocated descriptor LBA: 0x0000000000fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000001000000 blocks: 8386560 deallocated descriptor LBA: 0x00000000017ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000001800000 blocks: 8386560 deallocated descriptor LBA: 0x0000000001fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000002000000 blocks: 8386560 deallocated descriptor LBA: 0x00000000027ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000002800000 blocks: 8386560 deallocated descriptor LBA: 0x0000000002fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000003000000 blocks: 8386560 deallocated descriptor LBA: 0x00000000037ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000003800000 blocks: 8386560 deallocated descriptor LBA: 0x0000000003fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000004000000 blocks: 8386560 deallocated descriptor LBA: 0x00000000047ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000004800000 blocks: 8386560 deallocated descriptor LBA: 0x0000000004fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000005000000 blocks: 8386560 deallocated descriptor LBA: 0x00000000057ff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000005800000 blocks: 8386560 deallocated descriptor LBA: 0x0000000005fff800 blocks: 2048 mapped (or unknown) descriptor LBA: 0x0000000006000000 blocks: 6291456 deallocated descriptor LBA: 0x0000000006600000 blocks: 0 deallocated Although the discard bio starts at LBA 0 and has 50<<30 bytes size which are perfect aligned to the discard granularity, from the above list these are many 1MB (2048 sectors) internal fragments exist unexpectedly. The problem is in __blkdev_issue_discard(), an improper algorithm causes an improper bio size which is not aligned. 25 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 26 sector_t nr_sects, gfp_t gfp_mask, int flags, 27 struct bio **biop) 28 { 29 struct request_queue *q = bdev_get_queue(bdev); [snipped] 56 57 while (nr_sects) { 58 sector_t req_sects = min_t(sector_t, nr_sects, 59 bio_allowed_max_sectors(q)); 60 61 WARN_ON_ONCE((req_sects << 9) > UINT_MAX); 62 63 bio = blk_next_bio(bio, 0, gfp_mask); 64 bio->bi_iter.bi_sector = sector; 65 bio_set_dev(bio, bdev); 66 bio_set_op_attrs(bio, op, 0); 67 68 bio->bi_iter.bi_size = req_sects << 9; 69 sector += req_sects; 70 nr_sects -= req_sects; [snipped] 79 } 80 81 *biop = bio; 82 return 0; 83 } 84 EXPORT_SYMBOL(__blkdev_issue_discard); At line 58-59, to discard a 50GB range, req_sects is set as return value of bio_allowed_max_sectors(q), which is 8388607 sectors. In the above case, the discard granularity is 2048 sectors, although the start LBA and discard length are aligned to discard granularity, req_sects never has chance to be aligned to discard granularity. This is why there are some still-mapped 2048 sectors fragment in every 4 or 8 GB range. If req_sects at line 58 is set to a value aligned to discard_granularity and close to UNIT_MAX, then all consequent split bios inside device driver are (almostly) aligned to discard_granularity of the device queue. The 2048 sectors still-mapped fragment will disappear. This patch introduces bio_aligned_discard_max_sectors() to return the the value which is aligned to q->limits.discard_granularity and closest to UINT_MAX. Then this patch replaces bio_allowed_max_sectors() with this new routine to decide a more proper split bio length. But we still need to handle the situation when discard start LBA is not aligned to q->limits.discard_granularity, otherwise even the length is aligned, current code may still leave 2048 fragment around every 4GB range. Therefore, to calculate req_sects, firstly the start LBA of discard range is checked (including partition offset), if it is not aligned to discard granularity, the first split location should make sure following bio has bi_sector aligned to discard granularity. Then there won't be still-mapped fragment in the middle of the discard range. The above is how this patch improves discard bio alignment in __blkdev_issue_discard(). Now with this patch, after discard with same command line mentiond previously, sg_get_lba_status returns, descriptor LBA: 0x0000000000000000 blocks: 106954752 deallocated descriptor LBA: 0x0000000006600000 blocks: 0 deallocated We an see there is no 2048 sectors segment anymore, everything is clean. Reported-and-tested-by: Acshai Manoj Signed-off-by: Coly Li Reviewed-by: Hannes Reinecke Reviewed-by: Ming Lei Reviewed-by: Xiao Ni Cc: Bart Van Assche Cc: Christoph Hellwig Cc: Enzo Matsumiya Cc: Jens Axboe Signed-off-by: Jens Axboe --- block/blk-lib.c | 31 ++++++++++++++++++++++++++++--- block/blk.h | 14 ++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-lib.c b/block/blk-lib.c index 5f2c429d4378..019e09bb9c0e 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -29,7 +29,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; unsigned int op; - sector_t bs_mask; + sector_t bs_mask, part_offset = 0; if (!q) return -ENXIO; @@ -54,9 +54,34 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if (!nr_sects) return -EINVAL; + /* In case the discard request is in a partition */ + if (bdev->bd_partno) + part_offset = bdev->bd_part->start_sect; + while (nr_sects) { - sector_t req_sects = min_t(sector_t, nr_sects, - bio_allowed_max_sectors(q)); + sector_t granularity_aligned_lba, req_sects; + sector_t sector_mapped = sector + part_offset; + + granularity_aligned_lba = round_up(sector_mapped, + q->limits.discard_granularity >> SECTOR_SHIFT); + + /* + * Check whether the discard bio starts at a discard_granularity + * aligned LBA, + * - If no: set (granularity_aligned_lba - sector_mapped) to + * bi_size of the first split bio, then the second bio will + * start at a discard_granularity aligned LBA on the device. + * - If yes: use bio_aligned_discard_max_sectors() as the max + * possible bi_size of the first split bio. Then when this bio + * is split in device drive, the split ones are very probably + * to be aligned to discard_granularity of the device's queue. + */ + if (granularity_aligned_lba == sector_mapped) + req_sects = min_t(sector_t, nr_sects, + bio_aligned_discard_max_sectors(q)); + else + req_sects = min_t(sector_t, nr_sects, + granularity_aligned_lba - sector_mapped); WARN_ON_ONCE((req_sects << 9) > UINT_MAX); diff --git a/block/blk.h b/block/blk.h index 9dcf51c94096..49e2928a1632 100644 --- a/block/blk.h +++ b/block/blk.h @@ -264,6 +264,20 @@ static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; } +/* + * The max bio size which is aligned to q->limits.discard_granularity. This + * is a hint to split large discard bio in generic block layer, then if device + * driver needs to split the discard bio into smaller ones, their bi_size can + * be very probably and easily aligned to discard_granularity of the device's + * queue. + */ +static inline unsigned int bio_aligned_discard_max_sectors( + struct request_queue *q) +{ + return round_down(UINT_MAX, q->limits.discard_granularity) >> + SECTOR_SHIFT; +} + /* * Internal io_context interface */ -- cgit v1.2.3 From cd1fc4b98fb5953a220d690d45b11470fd9325d6 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Jun 2020 13:11:43 -0700 Subject: blk-cgroup: make iostat functions visible to stat printing Previously, the code which printed io.stat only needed access to the generic rstat flushing code, but since we plan to write some more specific code for preparing root cgroup stats, we need to manipulate iostat structs directly. Since declaring static functions ahead does not seem like common practice in this file, simply move the iostat functions up. We only plan to use blkg_iostat_set, but it seems better to keep them all together. Signed-off-by: Boris Burkov Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 142 ++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e00d0458a9d6..696d28151c9a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -711,6 +711,77 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx) } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); @@ -1086,77 +1157,6 @@ static int blkcg_can_attach(struct cgroup_taskset *tset) return ret; } -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - /* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; -- cgit v1.2.3 From ef45fe470e1e5410db4af87abc5d5055427945ac Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 1 Jun 2020 13:12:05 -0700 Subject: blk-cgroup: show global disk stats in root cgroup io.stat In order to improve consistency and usability in cgroup stat accounting, we would like to support the root cgroup's io.stat. Since the root cgroup has processes doing io even if the system has no explicitly created cgroups, we need to be careful to avoid overhead in that case. For that reason, the rstat algorithms don't handle the root cgroup, so just turning the file on wouldn't give correct statistics. To get around this, we simulate flushing the iostat struct by filling it out directly from global disk stats. The result is a root cgroup io.stat file consistent with both /proc/diskstats and io.stat. Note that in order to collect the disk stats, we needed to iterate over devices. To facilitate that, we had to change the linkage of a disk_type to external so that it can be used from blk-cgroup.c to iterate over disks. Suggested-by: Tejun Heo Signed-off-by: Boris Burkov Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- Documentation/admin-guide/cgroup-v2.rst | 3 +- block/blk-cgroup.c | 57 +++++++++++++++++++++++++++++++-- block/genhd.c | 4 +-- include/linux/genhd.h | 1 + 4 files changed, 58 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index ce3e05e41724..2d7c5907e0ce 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1483,8 +1483,7 @@ IO Interface Files ~~~~~~~~~~~~~~~~~~ io.stat - A read-only nested-keyed file which exists on non-root - cgroups. + A read-only nested-keyed file. Lines are keyed by $MAJ:$MIN device numbers and not ordered. The following nested keys are defined. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 696d28151c9a..619a79b51068 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -782,12 +782,66 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) rcu_read_unlock(); } +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + } +} + static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; - cgroup_rstat_flush(blkcg->css.cgroup); + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -876,7 +930,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ diff --git a/block/genhd.c b/block/genhd.c index c42a49f2f537..8b1e9f48957c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -38,8 +38,6 @@ static struct kobject *block_depr; static DEFINE_SPINLOCK(ext_devt_lock); static DEFINE_IDR(ext_devt_idr); -static const struct device_type disk_type; - static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); static void disk_alloc_events(struct gendisk *disk); @@ -1587,7 +1585,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static const struct device_type disk_type = { +const struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 31a54072ffd6..4ab853461dff 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -24,6 +24,7 @@ #define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) +extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; -- cgit v1.2.3 From 08c875cbf481d74db82d6bba2fbcf580087dee24 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 28 Jul 2020 15:29:51 +0200 Subject: block: Use non _rcu version of list functions for tag_set_list tag_set_list is only accessed under the tag_set_lock lock. There is no need for using the _rcu list functions. The _rcu list function were introduced to allow read access to the tag_set_list protected under RCU, see 705cda97ee3a ("blk-mq: Make it safe to use RCU to iterate over blk_mq_tag_set.tag_list") and 05b79413946d ("Revert "blk-mq: don't handle TAG_SHARED in restart""). Those changes got reverted later but the cleanup commit missed a couple of places to undo the changes. Fixes: 97889f9ac24f ("blk-mq: remove synchronize_rcu() from blk_mq_del_queue_tag_set()" Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Cc: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c3856377b961..c04e2452dd82 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2899,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); - list_del_rcu(&q->tag_set_list); + list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_SHARED; @@ -2926,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, } if (set->flags & BLK_MQ_F_TAG_SHARED) queue_set_hctx_shared(q, true); - list_add_tail_rcu(&q->tag_set_list, &set->tag_list); + list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } -- cgit v1.2.3 From d9012a59db54442d5b2fcfdfcded35cf566397d3 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Thu, 30 Jul 2020 17:03:21 +0800 Subject: iocost: Fix check condition of iocg abs_vdebt We shouldn't skip iocg when its abs_vdebt is not zero. Fixes: 0b80f9866e6b ("iocost: protect iocg->abs_vdebt with iocg->waitq.lock") Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index cea5ee9be639..521c29b8ae29 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1370,7 +1370,7 @@ static void ioc_timer_fn(struct timer_list *timer) * should have woken up in the last period and expire idle iocgs. */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt && + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg_is_idle(iocg)) continue; -- cgit v1.2.3 From f06678af91a42c27002a70148fe95899a34afb07 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:27 -0700 Subject: block: bfq-iosched: fix duplicated word Change "at at" to "at a". Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/bfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c8f034c01c..a4c0bec920cb 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -4714,7 +4714,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * some unlucky request wait for as long as the device * wishes. * - * Of course, serving one request at at time may cause loss of + * Of course, serving one request at a time may cause loss of * throughput. */ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -- cgit v1.2.3 From 3cf148891799d46542d8880c678e931c7ac5b32e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:28 -0700 Subject: block: bio: delete duplicated words Drop the repeated words "a" and "the". Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index ef91782fd668..c63ba04bd629 100644 --- a/block/bio.c +++ b/block/bio.c @@ -862,7 +862,7 @@ EXPORT_SYMBOL(bio_add_pc_page); * @same_page: return if the segment has been merged inside the same page * * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the + * useful optimisation for file systems with a block size smaller than the * page size. * * Warn if (@len, @off) crosses pages in case that @same_page is true. @@ -988,7 +988,7 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) * Pins pages from *iter and appends them to @bio's bvec array. The * pages will have to be released using put_page() when done. * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * next non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { -- cgit v1.2.3 From 5b8f65e1f9664171a4f4c07f8cdc212650f00e77 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:29 -0700 Subject: block: elevator: delete duplicated word and fix typos Drop the repeated word "the". Fix typos of "features" and "specified". Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/elevator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/elevator.c b/block/elevator.c index 4eab3d70e880..90ed7a28c21d 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -95,8 +95,8 @@ static inline bool elv_support_features(unsigned int elv_features, * @name: Elevator name to test * @required_features: Features that the elevator must provide * - * Return true is the elevator @e name matches @name and if @e provides all the - * the feratures spcified by @required_features. + * Return true if the elevator @e name matches @name and if @e provides all + * the features specified by @required_features. */ static bool elevator_match(const struct elevator_type *e, const char *name, unsigned int required_features) -- cgit v1.2.3 From 0d20dcc277cfb50ec1de4db0d758f30e8f597d30 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:30 -0700 Subject: block: genhd: delete duplicated words Drop the repeated word "to" in multiple places. Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/genhd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8b1e9f48957c..99c64641c314 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1773,7 +1773,7 @@ EXPORT_SYMBOL(__alloc_disk_node); /** * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to to increment the refcount for + * @disk: the struct gendisk to increment the refcount for * * This increments the refcount for the struct gendisk, and the gendisk's * fops module owner. @@ -1802,7 +1802,7 @@ EXPORT_SYMBOL(get_disk_and_module); /** * put_disk - decrements the gendisk refcount - * @disk: the struct gendisk to to decrement the refcount for + * @disk: the struct gendisk to decrement the refcount for * * This decrements the refcount for the struct gendisk. When this reaches 0 * we'll have disk_release() called. @@ -1819,7 +1819,7 @@ EXPORT_SYMBOL(put_disk); /** * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to to decrement the refcount for + * @disk: the struct gendisk to decrement the refcount for * * This is a counterpart of get_disk_and_module() and thus also of * get_gendisk(). -- cgit v1.2.3 From 70f15a4fd9a334f5f138045526f0035e77394e40 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:31 -0700 Subject: block: blk-mq: delete duplicated word Drop the repeated word "the". Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index c04e2452dd82..16099a7850a1 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -388,7 +388,7 @@ retry: /* * Give up the CPU and sleep for a random short time to ensure * that thread using a realtime scheduling class are migrated - * off the the CPU, and thus off the hctx that is going away. + * off the CPU, and thus off the hctx that is going away. */ msleep(3); goto retry; -- cgit v1.2.3 From c4aecaa256904bb1f46a8d898034b0c725b45015 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:32 -0700 Subject: block: blk-mq-sched: delete duplicated word Drop the repeated word "to". Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-mq-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b8db72cf1043..a19cdf159b75 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -225,7 +225,7 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to - * to be run again. This is necessary to avoid starving flushes. + * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { -- cgit v1.2.3 From d958e343bdc3de2643ce25225bed082dc222858d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 30 Jul 2020 18:42:33 -0700 Subject: block: blk-timeout: delete duplicated word Drop the repeated word "request". Change to the correct kernel-doc notation for function name separtor. Signed-off-by: Randy Dunlap Cc: Jens Axboe Cc: linux-block@vger.kernel.org Signed-off-by: Jens Axboe --- block/blk-timeout.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 4c1fc3417460..1b8de0417fc1 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -68,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** - * blk_abort_request -- Request request recovery for the specified command + * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the -- cgit v1.2.3