summaryrefslogtreecommitdiff
path: root/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 22:12:56 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-30 22:12:56 +0300
commitdf668a5fe461bb9d7e899c538acc7197746038f4 (patch)
tree315a71104f5cea7feeb56c9f2c768453408b72f7 /block
parentdf04fbe8680bfe07f3d7487eccff9f768bb02533 (diff)
parent2705dfb2094777e405e065105e307074af8965c1 (diff)
downloadlinux-df668a5fe461bb9d7e899c538acc7197746038f4.tar.xz
Merge tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block
Pull core block updates from Jens Axboe: - disk events cleanup (Christoph) - gendisk and request queue allocation simplifications (Christoph) - bdev_disk_changed cleanups (Christoph) - IO priority improvements (Bart) - Chained bio completion trace fix (Edward) - blk-wbt fixes (Jan) - blk-wbt enable/disable fix (Zhang) - Scheduler dispatch improvements (Jan, Ming) - Shared tagset scheduler improvements (John) - BFQ updates (Paolo, Luca, Pietro) - BFQ lock inversion fix (Jan) - Documentation improvements (Kir) - CLONE_IO block cgroup fix (Tejun) - Remove of ancient and deprecated block dump feature (zhangyi) - Discard merge fix (Ming) - Misc fixes or followup fixes (Colin, Damien, Dan, Long, Max, Thomas, Yang) * tag 'for-5.14/block-2021-06-29' of git://git.kernel.dk/linux-block: (129 commits) block: fix discard request merge block/mq-deadline: Remove a WARN_ON_ONCE() call blk-mq: update hctx->dispatch_busy in case of real scheduler blk: Fix lock inversion between ioc lock and bfqd lock bfq: Remove merged request already in bfq_requests_merged() block: pass a gendisk to bdev_disk_changed block: move bdev_disk_changed block: add the events* attributes to disk_attrs block: move the disk events code to a separate file block: fix trace completion for chained bio block/partitions/msdos: Fix typo inidicator -> indicator block, bfq: reset waker pointer with shared queues block, bfq: check waker only for queues with no in-flight I/O block, bfq: avoid delayed merge of async queues block, bfq: boost throughput by extending queue-merging times block, bfq: consider also creation time in delayed stable merge block, bfq: fix delayed stable merge check block, bfq: let also stably merged queues enjoy weight raising blk-wbt: make sure throttle is enabled properly blk-wbt: introduce a new disable state to prevent false positive by rwb_enabled() ...
Diffstat (limited to 'block')
-rw-r--r--block/Kconfig19
-rw-r--r--block/Kconfig.iosched6
-rw-r--r--block/Makefile6
-rw-r--r--block/bfq-iosched.c115
-rw-r--r--block/bio.c13
-rw-r--r--block/blk-cgroup.c41
-rw-r--r--block/blk-core.c22
-rw-r--r--block/blk-flush.c3
-rw-r--r--block/blk-ioprio.c262
-rw-r--r--block/blk-ioprio.h19
-rw-r--r--block/blk-merge.c27
-rw-r--r--block/blk-mq-debugfs.c15
-rw-r--r--block/blk-mq-sched.c99
-rw-r--r--block/blk-mq-sched.h5
-rw-r--r--block/blk-mq-tag.c114
-rw-r--r--block/blk-mq-tag.h15
-rw-r--r--block/blk-mq.c206
-rw-r--r--block/blk-mq.h14
-rw-r--r--block/blk-rq-qos.c4
-rw-r--r--block/blk-rq-qos.h38
-rw-r--r--block/blk-sysfs.c45
-rw-r--r--block/blk-wbt.c12
-rw-r--r--block/blk-wbt.h1
-rw-r--r--block/blk.h17
-rw-r--r--block/disk-events.c469
-rw-r--r--block/elevator.c17
-rw-r--r--block/genhd.c701
-rw-r--r--block/ioctl.c2
-rw-r--r--block/mq-deadline-cgroup.c126
-rw-r--r--block/mq-deadline-cgroup.h114
-rw-r--r--block/mq-deadline-main.c1175
-rw-r--r--block/mq-deadline.c815
-rw-r--r--block/partitions/core.c129
-rw-r--r--block/partitions/msdos.c2
34 files changed, 2911 insertions, 1757 deletions
diff --git a/block/Kconfig b/block/Kconfig
index a2297edfdde8..e71c63eaaf52 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -133,6 +133,13 @@ config BLK_WBT
dynamically on an algorithm loosely based on CoDel, factoring in
the realtime performance of the disk.
+config BLK_WBT_MQ
+ bool "Enable writeback throttling by default"
+ default y
+ depends on BLK_WBT
+ help
+ Enable writeback throttling by default for request-based block devices.
+
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
depends on BLK_CGROUP=y
@@ -155,12 +162,14 @@ config BLK_CGROUP_IOCOST
distributes IO capacity between different groups based on
their share of the overall weight distribution.
-config BLK_WBT_MQ
- bool "Multiqueue writeback throttling"
- default y
- depends on BLK_WBT
+config BLK_CGROUP_IOPRIO
+ bool "Cgroup I/O controller for assigning an I/O priority class"
+ depends on BLK_CGROUP
help
- Enable writeback throttling by default on multiqueue devices.
+ Enable the .prio interface for assigning an I/O priority class to
+ requests. The I/O priority class affects the order in which an I/O
+ scheduler and block devices process requests. Only some I/O schedulers
+ and some block devices support I/O priorities.
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 2f2158e05a91..64053d67a97b 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -9,6 +9,12 @@ config MQ_IOSCHED_DEADLINE
help
MQ version of the deadline IO scheduler.
+config MQ_IOSCHED_DEADLINE_CGROUP
+ tristate
+ default y
+ depends on MQ_IOSCHED_DEADLINE
+ depends on BLK_CGROUP
+
config MQ_IOSCHED_KYBER
tristate "Kyber I/O scheduler"
default y
diff --git a/block/Makefile b/block/Makefile
index 8d841f5f986f..bfbe4e13ca1e 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-exec.o blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
- genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
+ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
+ disk-events.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
@@ -17,9 +18,12 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
+mq-deadline-y += mq-deadline-main.o
+mq-deadline-$(CONFIG_MQ_IOSCHED_DEADLINE_CGROUP)+= mq-deadline-cgroup.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index acd1f881273e..727955918563 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -364,6 +364,16 @@ static int ref_wr_duration[2];
*/
static const unsigned long max_service_from_wr = 120000;
+/*
+ * Maximum time between the creation of two queues, for stable merge
+ * to be activated (in ms)
+ */
+static const unsigned long bfq_activation_stable_merging = 600;
+/*
+ * Minimum time to be waited before evaluating delayed stable merge (in ms)
+ */
+static const unsigned long bfq_late_stable_merging = 600;
+
#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
@@ -1729,10 +1739,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfqq->entity.new_weight == 40;
*interactive = !in_burst && idle_for_long_time &&
bfqq->entity.new_weight == 40;
+ /*
+ * Merged bfq_queues are kept out of weight-raising
+ * (low-latency) mechanisms. The reason is that these queues
+ * are usually created for non-interactive and
+ * non-soft-real-time tasks. Yet this is not the case for
+ * stably-merged queues. These queues are merged just because
+ * they are created shortly after each other. So they may
+ * easily serve the I/O of an interactive or soft-real time
+ * application, if the application happens to spawn multiple
+ * processes. So let also stably-merged queues enjoy weight
+ * raising.
+ */
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
(bfq_bfqq_sync(bfqq) &&
- bfqq->bic && (*interactive || soft_rt)));
+ (bfqq->bic || RQ_BIC(rq)->stably_merged) &&
+ (*interactive || soft_rt)));
/*
* Using the last flag, update budget and check whether bfqq
@@ -1962,14 +1985,18 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* Turning back to the detection of a waker queue, a queue Q is deemed
* as a waker queue for bfqq if, for three consecutive times, bfqq
* happens to become non empty right after a request of Q has been
- * completed. In particular, on the first time, Q is tentatively set
- * as a candidate waker queue, while on the third consecutive time
- * that Q is detected, the field waker_bfqq is set to Q, to confirm
- * that Q is a waker queue for bfqq. These detection steps are
- * performed only if bfqq has a long think time, so as to make it more
- * likely that bfqq's I/O is actually being blocked by a
- * synchronization. This last filter, plus the above three-times
- * requirement, make false positives less likely.
+ * completed. In this respect, even if bfqq is empty, we do not check
+ * for a waker if it still has some in-flight I/O. In fact, in this
+ * case bfqq is actually still being served by the drive, and may
+ * receive new I/O on the completion of some of the in-flight
+ * requests. In particular, on the first time, Q is tentatively set as
+ * a candidate waker queue, while on the third consecutive time that Q
+ * is detected, the field waker_bfqq is set to Q, to confirm that Q is
+ * a waker queue for bfqq. These detection steps are performed only if
+ * bfqq has a long think time, so as to make it more likely that
+ * bfqq's I/O is actually being blocked by a synchronization. This
+ * last filter, plus the above three-times requirement, make false
+ * positives less likely.
*
* NOTE
*
@@ -1995,6 +2022,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
+ bfqq->dispatched > 0 ||
now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return;
@@ -2317,9 +2345,9 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
+ spin_unlock_irq(&bfqd->lock);
if (free)
blk_mq_free_request(free);
- spin_unlock_irq(&bfqd->lock);
return ret;
}
@@ -2405,7 +2433,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
*next_bfqq = bfq_init_rq(next);
if (!bfqq)
- return;
+ goto remove;
/*
* If next and rq belong to the same bfq_queue and next is older
@@ -2428,6 +2456,14 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
bfqq->next_rq = rq;
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+remove:
+ /* Merged request may be in the IO scheduler. Remove it. */
+ if (!RB_EMPTY_NODE(&next->rb_node)) {
+ bfq_remove_request(next->q, next);
+ if (next_bfqq)
+ bfqg_stats_update_io_remove(bfqq_group(next_bfqq),
+ next->cmd_flags);
+ }
}
/* Must be called with bfqq != NULL */
@@ -2695,10 +2731,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* costly and complicated.
*/
if (unlikely(!bfqd->nonrot_with_queueing)) {
- if (bic->stable_merge_bfqq &&
+ /*
+ * Make sure also that bfqq is sync, because
+ * bic->stable_merge_bfqq may point to some queue (for
+ * stable merging) also if bic is associated with a
+ * sync queue, but this bfqq is async
+ */
+ if (bfq_bfqq_sync(bfqq) && bic->stable_merge_bfqq &&
!bfq_bfqq_just_created(bfqq) &&
- time_is_after_jiffies(bfqq->split_time +
- msecs_to_jiffies(200))) {
+ time_is_before_jiffies(bfqq->split_time +
+ msecs_to_jiffies(bfq_late_stable_merging)) &&
+ time_is_before_jiffies(bfqq->creation_time +
+ msecs_to_jiffies(bfq_late_stable_merging))) {
struct bfq_queue *stable_merge_bfqq =
bic->stable_merge_bfqq;
int proc_ref = min(bfqq_process_refs(bfqq),
@@ -5479,7 +5523,7 @@ static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
*/
if (!last_bfqq_created ||
time_before(last_bfqq_created->creation_time +
- bfqd->bfq_burst_interval,
+ msecs_to_jiffies(bfq_activation_stable_merging),
bfqq->creation_time) ||
bfqq->entity.parent != last_bfqq_created->entity.parent ||
bfqq->ioprio != last_bfqq_created->ioprio ||
@@ -5925,14 +5969,16 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
unsigned int cmd_flags;
+ LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
bfqg_stats_update_legacy_io(q, rq);
#endif
spin_lock_irq(&bfqd->lock);
- if (blk_mq_sched_try_insert_merge(q, rq)) {
+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
spin_unlock_irq(&bfqd->lock);
+ blk_mq_free_requests(&free);
return;
}
@@ -6129,11 +6175,13 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
* of other queues. But a false waker will unjustly steal
* bandwidth to its supposedly woken queue. So considering
* also shared queues in the waking mechanism may cause more
- * control troubles than throughput benefits. Then do not set
- * last_completed_rq_bfqq to bfqq if bfqq is a shared queue.
+ * control troubles than throughput benefits. Then reset
+ * last_completed_rq_bfqq if bfqq is a shared queue.
*/
if (!bfq_bfqq_coop(bfqq))
bfqd->last_completed_rq_bfqq = bfqq;
+ else
+ bfqd->last_completed_rq_bfqq = NULL;
/*
* If we are waiting to discover whether the request pattern
@@ -6376,6 +6424,7 @@ static void bfq_finish_requeue_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd;
+ unsigned long flags;
/*
* rq either is not associated with any icq, or is an already
@@ -6393,39 +6442,15 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->io_start_time_ns,
rq->cmd_flags);
+ spin_lock_irqsave(&bfqd->lock, flags);
if (likely(rq->rq_flags & RQF_STARTED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&bfqd->lock, flags);
-
if (rq == bfqd->waited_rq)
bfq_update_inject_limit(bfqd, bfqq);
bfq_completed_request(bfqq, bfqd);
- bfq_finish_requeue_request_body(bfqq);
-
- spin_unlock_irqrestore(&bfqd->lock, flags);
- } else {
- /*
- * Request rq may be still/already in the scheduler,
- * in which case we need to remove it (this should
- * never happen in case of requeue). And we cannot
- * defer such a check and removal, to avoid
- * inconsistencies in the time interval from the end
- * of this function to the start of the deferred work.
- * This situation seems to occur only in process
- * context, as a consequence of a merge. In the
- * current version of the code, this implies that the
- * lock is held.
- */
-
- if (!RB_EMPTY_NODE(&rq->rb_node)) {
- bfq_remove_request(rq->q, rq);
- bfqg_stats_update_io_remove(bfqq_group(bfqq),
- rq->cmd_flags);
- }
- bfq_finish_requeue_request_body(bfqq);
}
+ bfq_finish_requeue_request_body(bfqq);
+ spin_unlock_irqrestore(&bfqd->lock, flags);
/*
* Reset private fields. In case of a requeue, this allows
diff --git a/block/bio.c b/block/bio.c
index 44205dfb6b60..1fab762e079b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1375,8 +1375,7 @@ static inline bool bio_remaining_done(struct bio *bio)
*
* bio_endio() can be called several times on a bio that has been chained
* using bio_chain(). The ->bi_end_io() function will only be called the
- * last time. At this point the BLK_TA_COMPLETE tracing event will be
- * generated if BIO_TRACE_COMPLETION is set.
+ * last time.
**/
void bio_endio(struct bio *bio)
{
@@ -1389,6 +1388,11 @@ again:
if (bio->bi_bdev)
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ }
+
/*
* Need to have a real endio function for chained bios, otherwise
* various corner cases will break (like stacking block devices that
@@ -1402,11 +1406,6 @@ again:
goto again;
}
- if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
- bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- }
-
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 582d2f18717e..7b06a5fa3cac 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -31,6 +31,7 @@
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"
+#include "blk-ioprio.h"
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -1183,15 +1184,18 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
- ret = blk_throtl_init(q);
+ ret = blk_iolatency_init(q);
if (ret)
goto err_destroy_all;
- ret = blk_iolatency_init(q);
- if (ret) {
- blk_throtl_exit(q);
+ ret = blk_ioprio_init(q);
+ if (ret)
goto err_destroy_all;
- }
+
+ ret = blk_throtl_init(q);
+ if (ret)
+ goto err_destroy_all;
+
return 0;
err_destroy_all:
@@ -1217,32 +1221,6 @@ void blkcg_exit_queue(struct request_queue *q)
blk_throtl_exit(q);
}
-/*
- * We cannot support shared io contexts, as we have no mean to support
- * two tasks with the same ioc in two different groups without major rework
- * of the main cic data structures. For now we allow a task to change
- * its cgroup only if it's the only owner of its ioc.
- */
-static int blkcg_can_attach(struct cgroup_taskset *tset)
-{
- struct task_struct *task;
- struct cgroup_subsys_state *dst_css;
- struct io_context *ioc;
- int ret = 0;
-
- /* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, dst_css, tset) {
- task_lock(task);
- ioc = task->io_context;
- if (ioc && atomic_read(&ioc->nr_tasks) > 1)
- ret = -EINVAL;
- task_unlock(task);
- if (ret)
- break;
- }
- return ret;
-}
-
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
int i;
@@ -1275,7 +1253,6 @@ struct cgroup_subsys io_cgrp_subsys = {
.css_online = blkcg_css_online,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
- .can_attach = blkcg_can_attach,
.css_rstat_flush = blkcg_rstat_flush,
.bind = blkcg_bind,
.dfl_cftypes = blkcg_files,
diff --git a/block/blk-core.c b/block/blk-core.c
index 9bcdae93f6d4..514838ccab2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -599,7 +599,6 @@ fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
-EXPORT_SYMBOL(blk_alloc_queue);
/**
* blk_get_queue - increment the request_queue refcount
@@ -1086,15 +1085,6 @@ blk_qc_t submit_bio(struct bio *bio)
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
-
- if (unlikely(block_dump)) {
- char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
- current->comm, task_pid_nr(current),
- op_is_write(bio_op(bio)) ? "WRITE" : "READ",
- (unsigned long long)bio->bi_iter.bi_sector,
- bio_devname(bio, b), count);
- }
}
/*
@@ -1394,26 +1384,22 @@ void blk_steal_bios(struct bio_list *list, struct request *rq)
EXPORT_SYMBOL_GPL(blk_steal_bios);
/**
- * blk_update_request - Special helper function for request stacking drivers
+ * blk_update_request - Complete multiple bytes without completing the request
* @req: the request being processed
* @error: block status code
- * @nr_bytes: number of bytes to complete @req
+ * @nr_bytes: number of bytes to complete for @req
*
* Description:
* Ends I/O on a number of bytes attached to @req, but doesn't complete
* the request structure even if @req doesn't have leftover.
* If @req has leftover, sets it up for the next range of segments.
*
- * This special helper function is only for request stacking drivers
- * (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_mq_end_request instead.
- *
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
*
* Note:
- * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
- * blk_rq_bytes() and in blk_update_request().
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ * except in the consistency check at the end of this function.
*
* Return:
* %false - this request doesn't have any more data
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 7942ca6ed321..1002f6c58181 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -219,8 +219,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
- blk_account_io_flush(flush_rq);
-
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
@@ -230,6 +228,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
return;
}
+ blk_account_io_flush(flush_rq);
/*
* Flush request has to be marked as IDLE when it is really ended
* because its .end_io() is called from timeout code path too for
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
new file mode 100644
index 000000000000..332a07761bf8
--- /dev/null
+++ b/block/blk-ioprio.c
@@ -0,0 +1,262 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Block rq-qos policy for assigning an I/O priority class to requests.
+ *
+ * Using an rq-qos policy for assigning I/O priority class has two advantages
+ * over using the ioprio_set() system call:
+ *
+ * - This policy is cgroup based so it has all the advantages of cgroups.
+ * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos
+ * controller affects page cache writeback I/O for filesystems that support
+ * associating a cgroup with writeback I/O. See also
+ * Documentation/admin-guide/cgroup-v2.rst.
+ */
+
+#include <linux/blk-cgroup.h>
+#include <linux/blk-mq.h>
+#include <linux/blk_types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include "blk-ioprio.h"
+#include "blk-rq-qos.h"
+
+/**
+ * enum prio_policy - I/O priority class policy.
+ * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
+ * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT.
+ * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
+ * IOPRIO_CLASS_BE.
+ * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
+ *
+ * See also <linux/ioprio.h>.
+ */
+enum prio_policy {
+ POLICY_NO_CHANGE = 0,
+ POLICY_NONE_TO_RT = 1,
+ POLICY_RESTRICT_TO_BE = 2,
+ POLICY_ALL_TO_IDLE = 3,
+};
+
+static const char *policy_name[] = {
+ [POLICY_NO_CHANGE] = "no-change",
+ [POLICY_NONE_TO_RT] = "none-to-rt",
+ [POLICY_RESTRICT_TO_BE] = "restrict-to-be",
+ [POLICY_ALL_TO_IDLE] = "idle",
+};
+
+static struct blkcg_policy ioprio_policy;
+
+/**
+ * struct ioprio_blkg - Per (cgroup, request queue) data.
+ * @pd: blkg_policy_data structure.
+ */
+struct ioprio_blkg {
+ struct blkg_policy_data pd;
+};
+
+/**
+ * struct ioprio_blkcg - Per cgroup data.
+ * @cpd: blkcg_policy_data structure.
+ * @prio_policy: An enum prio_policy value. See also <linux/ioprio.h>.
+ */
+struct ioprio_blkcg {
+ struct blkcg_policy_data cpd;
+ enum prio_policy prio_policy;
+};
+
+static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
+}
+
+static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
+{
+ return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
+ struct ioprio_blkcg, cpd);
+}
+
+static struct ioprio_blkcg *
+ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
+{
+ return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
+}
+
+static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
+{
+ struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
+
+ if (!pd)
+ return NULL;
+
+ return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
+}
+
+static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
+
+ seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
+ return 0;
+}
+
+static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
+ int ret;
+
+ if (off != 0)
+ return -EIO;
+ /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
+ ret = sysfs_match_string(policy_name, buf);
+ if (ret < 0)
+ return ret;
+ blkcg->prio_policy = ret;
+
+ return nbytes;
+}
+
+static struct blkg_policy_data *
+ioprio_alloc_pd(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg)
+{
+ struct ioprio_blkg *ioprio_blkg;
+
+ ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
+ if (!ioprio_blkg)
+ return NULL;
+
+ return &ioprio_blkg->pd;
+}
+
+static void ioprio_free_pd(struct blkg_policy_data *pd)
+{
+ struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
+
+ kfree(ioprio_blkg);
+}
+
+static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
+{
+ struct ioprio_blkcg *blkcg;
+
+ blkcg = kzalloc(sizeof(*blkcg), gfp);
+ if (!blkcg)
+ return NULL;
+ blkcg->prio_policy = POLICY_NO_CHANGE;
+ return &blkcg->cpd;
+}
+
+static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
+{
+ struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd);
+
+ kfree(blkcg);
+}
+
+#define IOPRIO_ATTRS \
+ { \
+ .name = "prio.class", \
+ .seq_show = ioprio_show_prio_policy, \
+ .write = ioprio_set_prio_policy, \
+ }, \
+ { } /* sentinel */
+
+/* cgroup v2 attributes */
+static struct cftype ioprio_files[] = {
+ IOPRIO_ATTRS
+};
+
+/* cgroup v1 attributes */
+static struct cftype ioprio_legacy_files[] = {
+ IOPRIO_ATTRS
+};
+
+static struct blkcg_policy ioprio_policy = {
+ .dfl_cftypes = ioprio_files,
+ .legacy_cftypes = ioprio_legacy_files,
+
+ .cpd_alloc_fn = ioprio_alloc_cpd,
+ .cpd_free_fn = ioprio_free_cpd,
+
+ .pd_alloc_fn = ioprio_alloc_pd,
+ .pd_free_fn = ioprio_free_pd,
+};
+
+struct blk_ioprio {
+ struct rq_qos rqos;
+};
+
+static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
+ struct bio *bio)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+
+ /*
+ * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
+ * correspond to a lower priority. Hence, the max_t() below selects
+ * the lower priority of bi_ioprio and the cgroup I/O priority class.
+ * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the
+ * bio I/O priority is not modified. If the bio I/O priority equals
+ * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
+ */
+ bio->bi_ioprio = max_t(u16, bio->bi_ioprio,
+ IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
+}
+
+static void blkcg_ioprio_exit(struct rq_qos *rqos)
+{
+ struct blk_ioprio *blkioprio_blkg =
+ container_of(rqos, typeof(*blkioprio_blkg), rqos);
+
+ blkcg_deactivate_policy(rqos->q, &ioprio_policy);
+ kfree(blkioprio_blkg);
+}
+
+static struct rq_qos_ops blkcg_ioprio_ops = {
+ .track = blkcg_ioprio_track,
+ .exit = blkcg_ioprio_exit,
+};
+
+int blk_ioprio_init(struct request_queue *q)
+{
+ struct blk_ioprio *blkioprio_blkg;
+ struct rq_qos *rqos;
+ int ret;
+
+ blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL);
+ if (!blkioprio_blkg)
+ return -ENOMEM;
+
+ ret = blkcg_activate_policy(q, &ioprio_policy);
+ if (ret) {
+ kfree(blkioprio_blkg);
+ return ret;
+ }
+
+ rqos = &blkioprio_blkg->rqos;
+ rqos->id = RQ_QOS_IOPRIO;
+ rqos->ops = &blkcg_ioprio_ops;
+ rqos->q = q;
+
+ /*
+ * Registering the rq-qos policy after activating the blk-cgroup
+ * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the
+ * rq-qos callbacks.
+ */
+ rq_qos_add(q, rqos);
+
+ return 0;
+}
+
+static int __init ioprio_init(void)
+{
+ return blkcg_policy_register(&ioprio_policy);
+}
+
+static void __exit ioprio_exit(void)
+{
+ blkcg_policy_unregister(&ioprio_policy);
+}
+
+module_init(ioprio_init);
+module_exit(ioprio_exit);
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
new file mode 100644
index 000000000000..a7785c2f1aea
--- /dev/null
+++ b/block/blk-ioprio.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BLK_IOPRIO_H_
+#define _BLK_IOPRIO_H_
+
+#include <linux/kconfig.h>
+
+struct request_queue;
+
+#ifdef CONFIG_BLK_CGROUP_IOPRIO
+int blk_ioprio_init(struct request_queue *q);
+#else
+static inline int blk_ioprio_init(struct request_queue *q)
+{
+ return 0;
+}
+#endif
+
+#endif /* _BLK_IOPRIO_H_ */
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4d97fb6dd226..a11b3b53717e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -559,10 +559,14 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
- if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
+ if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
- if (blk_integrity_merge_bio(req->q, req, bio) == false)
+ /* discard request merge won't add new segment */
+ if (req_op(req) == REQ_OP_DISCARD)
+ return 1;
+
+ if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
goto no_merge;
/*
@@ -846,18 +850,15 @@ static struct request *attempt_front_merge(struct request_queue *q,
return NULL;
}
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
- struct request *next)
+/*
+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false
+ * otherwise. The caller is responsible for freeing 'next' if the merge
+ * happened.
+ */
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next)
{
- struct request *free;
-
- free = attempt_merge(q, rq, next);
- if (free) {
- blk_put_request(free);
- return 1;
- }
-
- return 0;
+ return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 2a75bc7401df..4b66d2776eda 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -937,6 +937,21 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q)
q->sched_debugfs_dir = NULL;
}
+static const char *rq_qos_id_to_name(enum rq_qos_id id)
+{
+ switch (id) {
+ case RQ_QOS_WBT:
+ return "wbt";
+ case RQ_QOS_LATENCY:
+ return "latency";
+ case RQ_QOS_COST:
+ return "cost";
+ case RQ_QOS_IOPRIO:
+ return "ioprio";
+ }
+ return "unknown";
+}
+
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
debugfs_remove_recursive(rqos->debugfs_dir);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 996a4b2f73aa..c838d81ac058 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -168,9 +168,19 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* in blk_mq_dispatch_rq_list().
*/
list_add_tail(&rq->queuelist, &rq_list);
+ count++;
if (rq->mq_hctx != hctx)
multi_hctxs = true;
- } while (++count < max_dispatch);
+
+ /*
+ * If we cannot get a tag for the request, stop dequeueing
+ * requests from the IO scheduler. We are unlikely to be able
+ * to submit them anyway and it creates a false impression for
+ * scheduling heuristics that the device can take more IO.
+ */
+ if (!blk_mq_get_driver_tag(rq))
+ break;
+ } while (count < max_dispatch);
if (!count) {
if (run_queue)
@@ -284,8 +294,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
- struct elevator_queue *e = q->elevator;
- const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+ const bool has_sched = q->elevator;
int ret = 0;
LIST_HEAD(rq_list);
@@ -316,12 +325,12 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
- if (has_sched_dispatch)
+ if (has_sched)
ret = blk_mq_do_dispatch_sched(hctx);
else
ret = blk_mq_do_dispatch_ctx(hctx);
}
- } else if (has_sched_dispatch) {
+ } else if (has_sched) {
ret = blk_mq_do_dispatch_sched(hctx);
} else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
@@ -390,9 +399,10 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
return ret;
}
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free)
{
- return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
@@ -453,7 +463,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
goto run;
}
- if (e && e->type->ops.insert_requests) {
+ if (e) {
LIST_HEAD(list);
list_add(&rq->queuelist, &list);
@@ -484,9 +494,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_get(&q->q_usage_counter);
e = hctx->queue->elevator;
- if (e && e->type->ops.insert_requests)
+ if (e) {
e->type->ops.insert_requests(hctx, list, false);
- else {
+ } else {
/*
* try to issue requests directly if the hw queue isn't
* busy in case of 'none' scheduler, and this way may save
@@ -509,11 +519,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
- unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
if (hctx->sched_tags) {
blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
- blk_mq_free_rq_map(hctx->sched_tags, flags);
+ blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
}
}
@@ -523,12 +531,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
unsigned int hctx_idx)
{
struct blk_mq_tag_set *set = q->tag_set;
- /* Clear HCTX_SHARED so tags are init'ed */
- unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
- set->reserved_tags, flags);
+ set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
@@ -546,16 +552,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
int i;
queue_for_each_hw_ctx(q, hctx, i) {
- /* Clear HCTX_SHARED so tags are freed */
- unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
-
if (hctx->sched_tags) {
- blk_mq_free_rq_map(hctx->sched_tags, flags);
+ blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
hctx->sched_tags = NULL;
}
}
}
+static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
+{
+ struct blk_mq_tag_set *set = queue->tag_set;
+ int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
+ struct blk_mq_hw_ctx *hctx;
+ int ret, i;
+
+ /*
+ * Allocate the bitmaps at the maximum depth (MAX_SCHED_RQ) so that
+ * updating nr_requests later needs a resize, not a reallocation.
+ */
+ ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
+ &queue->sched_breserved_tags,
+ MAX_SCHED_RQ, set->reserved_tags,
+ set->numa_node, alloc_policy);
+ if (ret)
+ return ret;
+
+ queue_for_each_hw_ctx(queue, hctx, i) { /* point every hctx's sched_tags at the queue-wide bitmaps */
+ hctx->sched_tags->bitmap_tags =
+ &queue->sched_bitmap_tags;
+ hctx->sched_tags->breserved_tags =
+ &queue->sched_breserved_tags;
+ }
+
+ sbitmap_queue_resize(&queue->sched_bitmap_tags,
+ queue->nr_requests - set->reserved_tags); /* resize to the depth currently set in nr_requests */
+
+ return 0;
+}
+
+static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
+{
+ sbitmap_queue_free(&queue->sched_bitmap_tags);
+ sbitmap_queue_free(&queue->sched_breserved_tags);
+}
+
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct blk_mq_hw_ctx *hctx;
@@ -580,12 +620,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
if (ret)
- goto err;
+ goto err_free_tags;
+ }
+
+ if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
+ ret = blk_mq_init_sched_shared_sbitmap(q);
+ if (ret)
+ goto err_free_tags;
}
ret = e->ops.init_sched(q, e);
if (ret)
- goto err;
+ goto err_free_sbitmap;
blk_mq_debugfs_register_sched(q);
@@ -605,7 +651,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
-err:
+err_free_sbitmap:
+ if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
+ blk_mq_exit_sched_shared_sbitmap(q);
+err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
q->elevator = NULL;
@@ -631,6 +680,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
unsigned int i;
+ unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
blk_mq_debugfs_unregister_sched_hctx(hctx);
@@ -638,10 +688,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
}
+ flags = hctx->flags;
}
blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q);
+ if (blk_mq_is_sbitmap_shared(flags))
+ blk_mq_exit_sched_shared_sbitmap(q);
q->elevator = NULL;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 5b18ab915c65..5246ae040704 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -5,13 +5,16 @@
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
+
void blk_mq_sched_assign_ioc(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2a37731e8244..86f87346232a 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -13,6 +13,7 @@
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
+#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
/*
@@ -199,6 +200,20 @@ struct bt_iter_data {
bool reserved;
};
+static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
+ unsigned int bitnr)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tags->lock, flags); /* rqs[] is read and the reference taken under ->lock */
+ rq = tags->rqs[bitnr];
+ if (!rq || !refcount_inc_not_zero(&rq->ref)) /* empty slot, or refcount already zero */
+ rq = NULL;
+ spin_unlock_irqrestore(&tags->lock, flags);
+ return rq; /* NULL, or a request with one extra reference held */
+}
+
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
@@ -206,18 +221,22 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_tags *tags = hctx->tags;
bool reserved = iter_data->reserved;
struct request *rq;
+ bool ret = true;
if (!reserved)
bitnr += tags->nr_reserved_tags;
- rq = tags->rqs[bitnr];
-
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
- return iter_data->fn(hctx, rq, iter_data->data, reserved);
- return true;
+ rq = blk_mq_find_and_get_req(tags, bitnr);
+ if (!rq)
+ return true;
+
+ if (rq->q == hctx->queue && rq->mq_hctx == hctx)
+ ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
+ blk_mq_put_rq_ref(rq);
+ return ret;
}
/**
@@ -264,6 +283,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
struct blk_mq_tags *tags = iter_data->tags;
bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
+ bool ret = true;
+ bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
if (!reserved)
bitnr += tags->nr_reserved_tags;
@@ -272,16 +293,19 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
+ if (iter_static_rqs)
rq = tags->static_rqs[bitnr];
else
- rq = tags->rqs[bitnr];
+ rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
- if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
- !blk_mq_request_started(rq))
- return true;
- return iter_data->fn(rq, iter_data->data, reserved);
+
+ if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
+ blk_mq_request_started(rq))
+ ret = iter_data->fn(rq, iter_data->data, reserved);
+ if (!iter_static_rqs)
+ blk_mq_put_rq_ref(rq);
+ return ret;
}
/**
@@ -348,6 +372,9 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
+ *
+ * We grab one request reference before calling @fn and release it after
+ * @fn returns.
*/
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
@@ -445,39 +472,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node);
}
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
- int node, int alloc_policy)
+int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
+ struct sbitmap_queue *breserved_tags,
+ unsigned int queue_depth, unsigned int reserved,
+ int node, int alloc_policy)
{
- unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+ unsigned int depth = queue_depth - reserved;
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
+ if (bt_alloc(bitmap_tags, depth, round_robin, node))
return -ENOMEM;
- if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
- round_robin, node))
+ if (bt_alloc(breserved_tags, reserved, round_robin, node))
goto free_bitmap_tags;
+ return 0;
+
+free_bitmap_tags:
+ sbitmap_queue_free(bitmap_tags);
+ return -ENOMEM;
+}
+
+static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
+ int node, int alloc_policy)
+{
+ int ret;
+
+ ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
+ &tags->__breserved_tags,
+ tags->nr_tags, tags->nr_reserved_tags,
+ node, alloc_policy);
+ if (ret)
+ return ret;
+
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
-free_bitmap_tags:
- sbitmap_queue_free(&tags->__bitmap_tags);
- return -ENOMEM;
}
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
+int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
- unsigned int depth = set->queue_depth - set->reserved_tags;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
- bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- int i, node = set->numa_node;
+ int i, ret;
- if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
- return -ENOMEM;
- if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
- round_robin, node))
- goto free_bitmap_tags;
+ ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
+ set->queue_depth, set->reserved_tags,
+ set->numa_node, alloc_policy);
+ if (ret)
+ return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
@@ -487,9 +529,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
}
return 0;
-free_bitmap_tags:
- sbitmap_queue_free(&set->__bitmap_tags);
- return -ENOMEM;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
@@ -516,6 +555,7 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
+ spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags))
return tags;
@@ -551,8 +591,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
*/
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
- /* Only sched tags can grow, so clear HCTX_SHARED flag */
- unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
struct blk_mq_tags *new;
bool ret;
@@ -563,21 +601,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
* We need some sort of upper limit, set it high enough that
* no valid use cases should require more.
*/
- if (tdepth > 16 * BLKDEV_MAX_RQ)
+ if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
- tags->nr_reserved_tags, flags);
+ tags->nr_reserved_tags, set->flags);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
- blk_mq_free_rq_map(new, flags);
+ blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
- blk_mq_free_rq_map(*tagsptr, flags);
+ blk_mq_free_rq_map(*tagsptr, set->flags);
*tagsptr = new;
} else {
/*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 7d3e6b333a4a..8ed55af08427 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -20,17 +20,26 @@ struct blk_mq_tags {
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
+
+ /*
+ * Serializes rqs[] lookups against clearing request references in
+ * rqs[] before a request pool is freed.
+ */
+ spinlock_t lock;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags,
int node, unsigned int flags);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
+extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
+ struct sbitmap_queue *breserved_tags,
+ unsigned int queue_depth,
+ unsigned int reserved,
+ int node, int alloc_policy);
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set,
- unsigned int flags);
+extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
-
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e41edae97487..2e9fd0ec63d7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -909,6 +909,14 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
}
+void blk_mq_put_rq_ref(struct request *rq)
+{
+ if (is_flush_rq(rq, rq->mq_hctx)) /* flush requests go through their ->end_io() callback */
+ rq->end_io(rq, 0);
+ else if (refcount_dec_and_test(&rq->ref)) /* last reference dropped: free the request */
+ __blk_mq_free_request(rq);
+}
+
static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
struct request *rq, void *priv, bool reserved)
{
@@ -942,11 +950,7 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
- if (is_flush_rq(rq, hctx))
- rq->end_io(rq, 0);
- else if (refcount_dec_and_test(&rq->ref))
- __blk_mq_free_request(rq);
-
+ blk_mq_put_rq_ref(rq);
return true;
}
@@ -1100,7 +1104,7 @@ static bool __blk_mq_get_driver_tag(struct request *rq)
return true;
}
-static bool blk_mq_get_driver_tag(struct request *rq)
+bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -1220,9 +1224,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
unsigned int ewma;
- if (hctx->queue->elevator)
- return;
-
ewma = hctx->dispatch_busy;
if (!ewma && !busy)
@@ -2303,6 +2304,45 @@ queue_exit:
return BLK_QC_T_NONE;
}
+static size_t order_to_size(unsigned int order)
+{
+ return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing the request pool in @tags; clears driver-tag rqs[] entries that point into it */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags, unsigned int hctx_idx)
+{
+ struct blk_mq_tags *drv_tags = set->tags[hctx_idx];
+ struct page *page;
+ unsigned long flags;
+
+ list_for_each_entry(page, &tags->page_list, lru) {
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + order_to_size(page->private);
+ int i;
+
+ for (i = 0; i < set->queue_depth; i++) {
+ struct request *rq = drv_tags->rqs[i];
+ unsigned long rq_addr = (unsigned long)rq;
+
+ if (rq_addr >= start && rq_addr < end) { /* rq lies inside this page of @tags */
+ WARN_ON_ONCE(refcount_read(&rq->ref) != 0);
+ cmpxchg(&drv_tags->rqs[i], rq, NULL); /* clear the slot only if it still holds rq */
+ }
+ }
+ }
+
+ /*
+ * Wait until all pending iterations are done.
+ *
+ * The request references are cleared, which is guaranteed to be
+ * observed once ->lock has been released.
+ */
+ spin_lock_irqsave(&drv_tags->lock, flags);
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
+}
+
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
@@ -2321,6 +2361,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
+ blk_mq_clear_rq_mapping(set, tags, hctx_idx);
+
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
@@ -2380,11 +2422,6 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
return tags;
}
-static size_t order_to_size(unsigned int order)
-{
- return (size_t)PAGE_SIZE << order;
-}
-
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, int node)
{
@@ -2603,16 +2640,49 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
&hctx->cpuhp_dead);
}
+/*
+ * Before freeing the hw queue, clear the flush request reference in
+ * tags->rqs[] to avoid a potential use-after-free.
+ */
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
+ unsigned int queue_depth, struct request *flush_rq)
+{
+ int i;
+ unsigned long flags;
+
+ /* The hw queue may not be mapped yet */
+ if (!tags)
+ return;
+
+ WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0);
+
+ for (i = 0; i < queue_depth; i++)
+ cmpxchg(&tags->rqs[i], flush_rq, NULL); /* only slots still pointing at flush_rq are cleared */
+
+ /*
+ * Wait until all pending iterations are done.
+ *
+ * The request reference is cleared, which is guaranteed to be
+ * observed once ->lock has been released.
+ */
+ spin_lock_irqsave(&tags->lock, flags);
+ spin_unlock_irqrestore(&tags->lock, flags);
+}
+
/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ struct request *flush_rq = hctx->fq->flush_rq;
+
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
+ set->queue_depth, flush_rq);
if (set->ops->exit_request)
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
+ set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -3042,21 +3112,18 @@ void blk_mq_release(struct request_queue *q)
struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
- struct request_queue *uninit_q, *q;
+ struct request_queue *q;
+ int ret;
- uninit_q = blk_alloc_queue(set->numa_node);
- if (!uninit_q)
+ q = blk_alloc_queue(set->numa_node);
+ if (!q)
return ERR_PTR(-ENOMEM);
- uninit_q->queuedata = queuedata;
-
- /*
- * Initialize the queue without an elevator. device_add_disk() will do
- * the initialization.
- */
- q = blk_mq_init_allocated_queue(set, uninit_q, false);
- if (IS_ERR(q))
- blk_cleanup_queue(uninit_q);
-
+ q->queuedata = queuedata;
+ ret = blk_mq_init_allocated_queue(set, q);
+ if (ret) {
+ blk_cleanup_queue(q);
+ return ERR_PTR(ret);
+ }
return q;
}
EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
@@ -3067,39 +3134,24 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
-/*
- * Helper for setting up a queue with mq ops, given queue depth, and
- * the passed in mq ops flags.
- */
-struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops,
- unsigned int queue_depth,
- unsigned int set_flags)
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata)
{
struct request_queue *q;
- int ret;
+ struct gendisk *disk;
- memset(set, 0, sizeof(*set));
- set->ops = ops;
- set->nr_hw_queues = 1;
- set->nr_maps = 1;
- set->queue_depth = queue_depth;
- set->numa_node = NUMA_NO_NODE;
- set->flags = set_flags;
-
- ret = blk_mq_alloc_tag_set(set);
- if (ret)
- return ERR_PTR(ret);
+ q = blk_mq_init_queue_data(set, queuedata);
+ if (IS_ERR(q))
+ return ERR_CAST(q);
- q = blk_mq_init_queue(set);
- if (IS_ERR(q)) {
- blk_mq_free_tag_set(set);
- return q;
+ disk = __alloc_disk_node(0, set->numa_node);
+ if (!disk) {
+ blk_cleanup_queue(q);
+ return ERR_PTR(-ENOMEM);
}
-
- return q;
+ disk->queue = q;
+ return disk;
}
-EXPORT_SYMBOL(blk_mq_init_sq_queue);
+EXPORT_SYMBOL(__blk_mq_alloc_disk);
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
@@ -3212,9 +3264,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
mutex_unlock(&q->sysfs_lock);
}
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q,
- bool elevator_init)
+int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -3264,11 +3315,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
-
- if (elevator_init)
- elevator_init_mq(q);
-
- return q;
+ return 0;
err_hctxs:
kfree(q->queue_hw_ctx);
@@ -3279,7 +3326,7 @@ err_poll:
q->poll_cb = NULL;
err_exit:
q->mq_ops = NULL;
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -3491,7 +3538,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (blk_mq_is_sbitmap_shared(set->flags)) {
atomic_set(&set->active_queues_shared_sbitmap, 0);
- if (blk_mq_init_shared_sbitmap(set, set->flags)) {
+ if (blk_mq_init_shared_sbitmap(set)) {
ret = -ENOMEM;
goto out_free_mq_rq_maps;
}
@@ -3516,6 +3563,22 @@ out_free_mq_map:
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+/* allocate and initialize a tag set for a simple single-queue device (one hw queue, one map) */
+int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
+ const struct blk_mq_ops *ops, unsigned int queue_depth,
+ unsigned int set_flags)
+{
+ memset(set, 0, sizeof(*set)); /* anything the caller stored in @set beforehand is wiped */
+ set->ops = ops;
+ set->nr_hw_queues = 1;
+ set->nr_maps = 1;
+ set->queue_depth = queue_depth;
+ set->numa_node = NUMA_NO_NODE;
+ set->flags = set_flags;
+ return blk_mq_alloc_tag_set(set); /* result of blk_mq_alloc_tag_set() is passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
+
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i, j;
@@ -3567,15 +3630,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
} else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true);
+ if (blk_mq_is_sbitmap_shared(set->flags)) {
+ hctx->sched_tags->bitmap_tags =
+ &q->sched_bitmap_tags;
+ hctx->sched_tags->breserved_tags =
+ &q->sched_breserved_tags;
+ }
}
if (ret)
break;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
}
-
- if (!ret)
+ if (!ret) {
q->nr_requests = nr;
+ if (q->elevator && blk_mq_is_sbitmap_shared(set->flags))
+ sbitmap_queue_resize(&q->sched_bitmap_tags,
+ nr - set->reserved_tags);
+ }
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9ce64bc4a6c8..d08779f77a26 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -47,6 +47,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
+void blk_mq_put_rq_ref(struct request *rq);
/*
* Internal helpers for allocating/freeing the request map
@@ -259,6 +260,8 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
+bool blk_mq_get_driver_tag(struct request *rq);
+
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
int cpu;
@@ -299,6 +302,17 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
return NULL;
}
+/* Unlink and free every request on @list; @list is empty on return */
+static inline void blk_mq_free_requests(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct request *rq = list_entry_rq(list->next);
+
+ list_del_init(&rq->queuelist); /* take it off @list before freeing it */
+ blk_mq_free_request(rq);
+ }
+}
+
/*
* For shared tag users, we track the number of currently active users
* and attempt to provide a fair share of the tag depth for each of them.
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 656460636ad3..e83af7bc7591 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -266,8 +266,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
return;
- prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
- has_sleeper = !wq_has_single_sleeper(&rqw->wait);
+ has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+ TASK_UNINTERRUPTIBLE);
do {
/* The memory barrier in set_task_state saves us here. */
if (data.got_token)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2bc43e94f4c4..f000f83e0621 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -7,6 +7,7 @@
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
+#include <linux/blk-mq.h>
#include "blk-mq-debugfs.h"
@@ -16,6 +17,7 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
+ RQ_QOS_IOPRIO,
};
struct rq_wait {
@@ -78,19 +80,6 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_LATENCY);
}
-static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
-{
- switch (id) {
- case RQ_QOS_WBT:
- return "wbt";
- case RQ_QOS_LATENCY:
- return "latency";
- case RQ_QOS_COST:
- return "cost";
- }
- return "unknown";
-}
-
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
@@ -99,8 +88,21 @@ static inline void rq_wait_init(struct rq_wait *rq_wait)
static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
{
+ /*
+ * No IO can be in-flight when adding rqos, so freeze queue, which
+ * is fine since we only support rq_qos for blk-mq queue.
+ *
+ * Reuse ->queue_lock for protecting against other concurrent
+ * rq_qos adding/deleting
+ */
+ blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
rqos->next = q->rq_qos;
q->rq_qos = rqos;
+ spin_unlock_irq(&q->queue_lock);
+
+ blk_mq_unfreeze_queue(q);
if (rqos->ops->debugfs_attrs)
blk_mq_debugfs_register_rqos(rqos);
@@ -110,12 +112,22 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
{
struct rq_qos **cur;
+ /*
+ * See comment in rq_qos_add() about freezing queue & using
+ * ->queue_lock.
+ */
+ blk_mq_freeze_queue(q);
+
+ spin_lock_irq(&q->queue_lock);
for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
if (*cur == rqos) {
*cur = rqos->next;
break;
}
}
+ spin_unlock_irq(&q->queue_lock);
+
+ blk_mq_unfreeze_queue(q);
blk_mq_debugfs_unregister_rqos(rqos);
}
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e03bedf180ab..370d83c18057 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -91,7 +91,7 @@ static ssize_t queue_ra_show(struct request_queue *q, char *page)
unsigned long ra_kb = q->backing_dev_info->ra_pages <<
(PAGE_SHIFT - 10);
- return queue_var_show(ra_kb, (page));
+ return queue_var_show(ra_kb, page);
}
static ssize_t
@@ -112,28 +112,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
{
int max_sectors_kb = queue_max_sectors(q) >> 1;
- return queue_var_show(max_sectors_kb, (page));
+ return queue_var_show(max_sectors_kb, page);
}
static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_segments(q), (page));
+ return queue_var_show(queue_max_segments(q), page);
}
static ssize_t queue_max_discard_segments_show(struct request_queue *q,
char *page)
{
- return queue_var_show(queue_max_discard_segments(q), (page));
+ return queue_var_show(queue_max_discard_segments(q), page);
}
static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->limits.max_integrity_segments, (page));
+ return queue_var_show(q->limits.max_integrity_segments, page);
}
static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_segment_size(q), (page));
+ return queue_var_show(queue_max_segment_size(q), page);
}
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
@@ -261,12 +261,12 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
{
int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
- return queue_var_show(max_hw_sectors_kb, (page));
+ return queue_var_show(max_hw_sectors_kb, page);
}
static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->limits.virt_boundary_mask, (page));
+ return queue_var_show(q->limits.virt_boundary_mask, page);
}
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
@@ -866,20 +866,6 @@ int blk_register_queue(struct gendisk *disk)
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
- /*
- * SCSI probing may synchronously create and destroy a lot of
- * request_queues for non-existent devices. Shutting down a fully
- * functional queue takes measureable wallclock time as RCU grace
- * periods are involved. To avoid excessive latency in these
- * cases, a request_queue starts out in a degraded mode which is
- * faster to shut down and is made fully functional here as
- * request_queues for non-existent devices never get registered.
- */
- if (!blk_queue_init_done(q)) {
- blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
- percpu_ref_switch_to_percpu(&q->q_usage_counter);
- }
-
blk_queue_update_readahead(q);
ret = blk_trace_init_sysfs(dev);
@@ -938,6 +924,21 @@ int blk_register_queue(struct gendisk *disk)
ret = 0;
unlock:
mutex_unlock(&q->sysfs_dir_lock);
+
+ /*
+ * SCSI probing may synchronously create and destroy a lot of
+ * request_queues for non-existent devices. Shutting down a fully
+ * functional queue takes measureable wallclock time as RCU grace
+ * periods are involved. To avoid excessive latency in these
+ * cases, a request_queue starts out in a degraded mode which is
+ * faster to shut down and is made fully functional here as
+ * request_queues for non-existent devices never get registered.
+ */
+ if (!blk_queue_init_done(q)) {
+ blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q);
+ percpu_ref_switch_to_percpu(&q->q_usage_counter);
+ }
+
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 42aed0160f86..3ed71b8da887 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -77,7 +77,8 @@ enum {
static inline bool rwb_enabled(struct rq_wb *rwb)
{
- return rwb && rwb->wb_normal != 0;
+ return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
+ rwb->wb_normal != 0;
}
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
@@ -563,7 +564,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
}
/*
- * Returns true if the IO request should be accounted, false if not.
* May sleep, if we have exceeded the writeback limits. Caller can pass
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
@@ -636,9 +636,13 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
void wbt_enable_default(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
+
/* Throttling already enabled? */
- if (rqos)
+ if (rqos) {
+ if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
+ }
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
@@ -702,7 +706,7 @@ void wbt_disable_default(struct request_queue *q)
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
- rwb->wb_normal = 0;
+ rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 16bdc85b8df9..2eb01becde8c 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -34,6 +34,7 @@ enum {
enum {
WBT_STATE_ON_DEFAULT = 1,
WBT_STATE_ON_MANUAL = 2,
+ WBT_STATE_OFF_DEFAULT
};
struct rq_wb {
diff --git a/block/blk.h b/block/blk.h
index 8b3591aee0a5..4b885c0f6708 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -192,7 +192,6 @@ void blk_account_io_done(struct request *req, u64 now);
void blk_insert_flush(struct request *rq);
-void elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
@@ -225,7 +224,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
@@ -343,8 +342,8 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
static inline void blk_queue_clear_zone_settings(struct request_queue *q) {}
#endif
-int blk_alloc_devt(struct block_device *part, dev_t *devt);
-void blk_free_devt(dev_t devt);
+int blk_alloc_ext_minor(void);
+void blk_free_ext_minor(unsigned int minor);
char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
@@ -359,4 +358,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
+struct request_queue *blk_alloc_queue(int node_id);
+
+void disk_alloc_events(struct gendisk *disk);
+void disk_add_events(struct gendisk *disk);
+void disk_del_events(struct gendisk *disk);
+void disk_release_events(struct gendisk *disk);
+extern struct device_attribute dev_attr_events;
+extern struct device_attribute dev_attr_events_async;
+extern struct device_attribute dev_attr_events_poll_msecs;
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/disk-events.c b/block/disk-events.c
new file mode 100644
index 000000000000..a75931ff5da4
--- /dev/null
+++ b/block/disk-events.c
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Disk events - monitor disk events like media change and eject request.
+ */
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/genhd.h>
+#include "blk.h"
+
+struct disk_events {
+ struct list_head node; /* all disk_event's */
+ struct gendisk *disk; /* the associated disk */
+ spinlock_t lock;
+
+ struct mutex block_mutex; /* protects blocking */
+ int block; /* event blocking depth */
+ unsigned int pending; /* events already sent out */
+ unsigned int clearing; /* events being cleared */
+
+ long poll_msecs; /* interval, -1 for default */
+ struct delayed_work dwork;
+};
+
+static const char *disk_events_strs[] = {
+ [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
+ [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
+};
+
+static char *disk_uevents[] = {
+ [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
+ [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
+};
+
+/* list of all disk_events */
+static DEFINE_MUTEX(disk_events_mutex);
+static LIST_HEAD(disk_events);
+
+/* disable in-kernel polling by default */
+static unsigned long disk_events_dfl_poll_msecs;
+
+static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
+{
+ struct disk_events *ev = disk->ev;
+ long intv_msecs = 0;
+
+ /*
+ * If device-specific poll interval is set, always use it. If
+ * the default is being used, poll if the POLL flag is set.
+ */
+ if (ev->poll_msecs >= 0)
+ intv_msecs = ev->poll_msecs;
+ else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
+ intv_msecs = disk_events_dfl_poll_msecs;
+
+ return msecs_to_jiffies(intv_msecs);
+}
+
+/**
+ * disk_block_events - block and flush disk event checking
+ * @disk: disk to block events for
+ *
+ * On return from this function, it is guaranteed that event checking
+ * isn't in progress and won't happen until unblocked by
+ * disk_unblock_events(). Events blocking is counted and the actual
+ * unblocking happens after the matching number of unblocks are done.
+ *
+ * Note that this intentionally does not block event checking from
+ * disk_clear_events().
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void disk_block_events(struct gendisk *disk)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned long flags;
+ bool cancel;
+
+ if (!ev)
+ return;
+
+ /*
+ * Outer mutex ensures that the first blocker completes canceling
+ * the event work before further blockers are allowed to finish.
+ */
+ mutex_lock(&ev->block_mutex);
+
+ spin_lock_irqsave(&ev->lock, flags);
+ cancel = !ev->block++;
+ spin_unlock_irqrestore(&ev->lock, flags);
+
+ if (cancel)
+ cancel_delayed_work_sync(&disk->ev->dwork);
+
+ mutex_unlock(&ev->block_mutex);
+}
+
+static void __disk_unblock_events(struct gendisk *disk, bool check_now)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned long intv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ev->lock, flags);
+
+ if (WARN_ON_ONCE(ev->block <= 0))
+ goto out_unlock;
+
+ if (--ev->block)
+ goto out_unlock;
+
+ intv = disk_events_poll_jiffies(disk);
+ if (check_now)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
+ else if (intv)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
+out_unlock:
+ spin_unlock_irqrestore(&ev->lock, flags);
+}
+
+/**
+ * disk_unblock_events - unblock disk event checking
+ * @disk: disk to unblock events for
+ *
+ * Undo disk_block_events(). When the block count reaches zero, it
+ * starts events polling if configured.
+ *
+ * CONTEXT:
+ * Don't care. Safe to call from irq context.
+ */
+void disk_unblock_events(struct gendisk *disk)
+{
+ if (disk->ev)
+ __disk_unblock_events(disk, false);
+}
+
+/**
+ * disk_flush_events - schedule immediate event checking and flushing
+ * @disk: disk to check and flush events for
+ * @mask: events to flush
+ *
+ * Schedule immediate event checking on @disk if not blocked. Events in
+ * @mask are scheduled to be cleared from the driver. Note that this
+ * doesn't clear the events from @disk->ev.
+ *
+ * CONTEXT:
+ * If @mask is non-zero must be called with disk->open_mutex held.
+ */
+void disk_flush_events(struct gendisk *disk, unsigned int mask)
+{
+ struct disk_events *ev = disk->ev;
+
+ if (!ev)
+ return;
+
+ spin_lock_irq(&ev->lock);
+ ev->clearing |= mask;
+ if (!ev->block)
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
+ spin_unlock_irq(&ev->lock);
+}
+
+static void disk_check_events(struct disk_events *ev,
+ unsigned int *clearing_ptr)
+{
+ struct gendisk *disk = ev->disk;
+ char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
+ unsigned int clearing = *clearing_ptr;
+ unsigned int events;
+ unsigned long intv;
+ int nr_events = 0, i;
+
+ /* check events */
+ events = disk->fops->check_events(disk, clearing);
+
+ /* accumulate pending events and schedule next poll if necessary */
+ spin_lock_irq(&ev->lock);
+
+ events &= ~ev->pending;
+ ev->pending |= events;
+ *clearing_ptr &= ~clearing;
+
+ intv = disk_events_poll_jiffies(disk);
+ if (!ev->block && intv)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
+
+ spin_unlock_irq(&ev->lock);
+
+ /*
+ * Tell userland about new events. Only the events listed in
+ * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
+ * is set. Otherwise, events are processed internally but never
+ * get reported to userland.
+ */
+ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
+ if ((events & disk->events & (1 << i)) &&
+ (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
+ envp[nr_events++] = disk_uevents[i];
+
+ if (nr_events)
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+}
+
+/**
+ * disk_clear_events - synchronously check, clear and return pending events
+ * @disk: disk to fetch and clear events from
+ * @mask: mask of events to be fetched and cleared
+ *
+ * Disk events are synchronously checked and pending events in @mask
+ * are cleared and returned. This ignores the block count.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned int pending;
+ unsigned int clearing = mask;
+
+ if (!ev)
+ return 0;
+
+ disk_block_events(disk);
+
+ /*
+ * store the union of mask and ev->clearing on the stack so that the
+ * race with disk_flush_events does not cause ambiguity (ev->clearing
+ * can still be modified even if events are blocked).
+ */
+ spin_lock_irq(&ev->lock);
+ clearing |= ev->clearing;
+ ev->clearing = 0;
+ spin_unlock_irq(&ev->lock);
+
+ disk_check_events(ev, &clearing);
+ /*
+ * if ev->clearing is not 0, disk_flush_events() was called in the
+ * middle of this function, so we want to run the workfn without delay.
+ */
+ __disk_unblock_events(disk, ev->clearing ? true : false);
+
+ /* then, fetch and clear pending events */
+ spin_lock_irq(&ev->lock);
+ pending = ev->pending & mask;
+ ev->pending &= ~mask;
+ spin_unlock_irq(&ev->lock);
+ WARN_ON_ONCE(clearing & mask);
+
+ return pending;
+}
+
+/**
+ * bdev_check_media_change - check if a removable media has been changed
+ * @bdev: block device to check
+ *
+ * Check whether a removable media has been changed, and attempt to free all
+ * dentries and inodes and invalidate all block device page cache entries in
+ * that case.
+ *
+ * Returns %true if the block device changed, or %false if not.
+ */
+bool bdev_check_media_change(struct block_device *bdev)
+{
+ unsigned int events;
+
+ events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
+ DISK_EVENT_EJECT_REQUEST);
+ if (!(events & DISK_EVENT_MEDIA_CHANGE))
+ return false;
+
+ if (__invalidate_device(bdev, true))
+ pr_warn("VFS: busy inodes on changed media %s\n",
+ bdev->bd_disk->disk_name);
+ set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+ return true;
+}
+EXPORT_SYMBOL(bdev_check_media_change);
+
+/*
+ * Separate this part out so that a different pointer for clearing_ptr can be
+ * passed in for disk_clear_events.
+ */
+static void disk_events_workfn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
+
+ disk_check_events(ev, &ev->clearing);
+}
+
+/*
+ * A disk events enabled device has the following sysfs nodes under
+ * its /sys/block/X/ directory.
+ *
+ * events : list of all supported events
+ * events_async : list of events which can be detected w/o polling
+ * (always empty, only for backwards compatibility)
+ * events_poll_msecs : polling interval, 0: disable, -1: system default
+ */
+static ssize_t __disk_events_show(unsigned int events, char *buf)
+{
+ const char *delim = "";
+ ssize_t pos = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
+ if (events & (1 << i)) {
+ pos += sprintf(buf + pos, "%s%s",
+ delim, disk_events_strs[i]);
+ delim = " ";
+ }
+ if (pos)
+ pos += sprintf(buf + pos, "\n");
+ return pos;
+}
+
+static ssize_t disk_events_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
+ return 0;
+ return __disk_events_show(disk->events, buf);
+}
+
+static ssize_t disk_events_async_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return 0;
+}
+
+static ssize_t disk_events_poll_msecs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->ev)
+ return sprintf(buf, "-1\n");
+ return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
+}
+
+static ssize_t disk_events_poll_msecs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ long intv;
+
+ if (!count || !sscanf(buf, "%ld", &intv))
+ return -EINVAL;
+
+ if (intv < 0 && intv != -1)
+ return -EINVAL;
+
+ if (!disk->ev)
+ return -ENODEV;
+
+ disk_block_events(disk);
+ disk->ev->poll_msecs = intv;
+ __disk_unblock_events(disk, true);
+ return count;
+}
+
+DEVICE_ATTR(events, 0444, disk_events_show, NULL);
+DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
+DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show,
+ disk_events_poll_msecs_store);
+
+/*
+ * The default polling interval can be specified by the kernel
+ * parameter block.events_dfl_poll_msecs which defaults to 0
+ * (disable). This can also be modified at runtime by writing to
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
+ */
+static int disk_events_set_dfl_poll_msecs(const char *val,
+ const struct kernel_param *kp)
+{
+ struct disk_events *ev;
+ int ret;
+
+ ret = param_set_ulong(val, kp);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&disk_events_mutex);
+ list_for_each_entry(ev, &disk_events, node)
+ disk_flush_events(ev->disk, 0);
+ mutex_unlock(&disk_events_mutex);
+ return 0;
+}
+
+static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
+ .set = disk_events_set_dfl_poll_msecs,
+ .get = param_get_ulong,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "block."
+
+module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
+ &disk_events_dfl_poll_msecs, 0644);
+
+/*
+ * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
+ */
+void disk_alloc_events(struct gendisk *disk)
+{
+ struct disk_events *ev;
+
+ if (!disk->fops->check_events || !disk->events)
+ return;
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev) {
+ pr_warn("%s: failed to initialize events\n", disk->disk_name);
+ return;
+ }
+
+ INIT_LIST_HEAD(&ev->node);
+ ev->disk = disk;
+ spin_lock_init(&ev->lock);
+ mutex_init(&ev->block_mutex);
+ ev->block = 1;
+ ev->poll_msecs = -1;
+ INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
+
+ disk->ev = ev;
+}
+
+void disk_add_events(struct gendisk *disk)
+{
+ if (!disk->ev)
+ return;
+
+ mutex_lock(&disk_events_mutex);
+ list_add_tail(&disk->ev->node, &disk_events);
+ mutex_unlock(&disk_events_mutex);
+
+ /*
+ * Block count is initialized to 1 and the following initial
+ * unblock kicks it into action.
+ */
+ __disk_unblock_events(disk, true);
+}
+
+void disk_del_events(struct gendisk *disk)
+{
+ if (disk->ev) {
+ disk_block_events(disk);
+
+ mutex_lock(&disk_events_mutex);
+ list_del_init(&disk->ev->node);
+ mutex_unlock(&disk_events_mutex);
+ }
+}
+
+void disk_release_events(struct gendisk *disk)
+{
+ /* the block count should be 1 from disk_del_events() */
+ WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
+ kfree(disk->ev);
+}
diff --git a/block/elevator.c b/block/elevator.c
index 440699c28119..52ada14cfe45 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -350,9 +350,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
* we can append 'rq' to an existing request, so we can throw 'rq' away
* afterwards.
*
- * Returns true if we merged, false otherwise
+ * Returns true if we merged, false otherwise. 'free' will contain all
+ * requests that need to be freed.
*/
-bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free)
{
struct request *__rq;
bool ret;
@@ -363,8 +365,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
/*
* First try one-hit cache.
*/
- if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
+ list_add(&rq->queuelist, free);
return true;
+ }
if (blk_queue_noxmerges(q))
return false;
@@ -378,6 +382,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
break;
+ list_add(&rq->queuelist, free);
/* The merged request could be merged with others, try again */
ret = true;
rq = __rq;
@@ -522,6 +527,10 @@ void elv_unregister_queue(struct request_queue *q)
int elv_register(struct elevator_type *e)
{
+ /* insert_requests and dispatch_request are mandatory */
+ if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request))
+ return -EINVAL;
+
/* create icq_cache if requested */
if (e->icq_size) {
if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
@@ -693,7 +702,7 @@ void elevator_init_mq(struct request_queue *q)
elevator_put(e);
}
}
-
+EXPORT_SYMBOL_GPL(elevator_init_mq); /* only for dm-rq */
/*
* switch to new_e io scheduler. be careful not to introduce deadlocks -
diff --git a/block/genhd.c b/block/genhd.c
index 9f8cb7beaad1..79aa40b4c39c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -33,13 +33,6 @@ static struct kobject *block_depr;
#define NR_EXT_DEVT (1 << MINORBITS)
static DEFINE_IDA(ext_devt_ida);
-static void disk_check_events(struct disk_events *ev,
- unsigned int *clearing_ptr);
-static void disk_alloc_events(struct gendisk *disk);
-static void disk_add_events(struct gendisk *disk);
-static void disk_del_events(struct gendisk *disk);
-static void disk_release_events(struct gendisk *disk);
-
void set_capacity(struct gendisk *disk, sector_t sectors)
{
struct block_device *bdev = disk->part0;
@@ -333,52 +326,22 @@ static int blk_mangle_minor(int minor)
return minor;
}
-/**
- * blk_alloc_devt - allocate a dev_t for a block device
- * @bdev: block device to allocate dev_t for
- * @devt: out parameter for resulting dev_t
- *
- * Allocate a dev_t for block device.
- *
- * RETURNS:
- * 0 on success, allocated dev_t is returned in *@devt. -errno on
- * failure.
- *
- * CONTEXT:
- * Might sleep.
- */
-int blk_alloc_devt(struct block_device *bdev, dev_t *devt)
+int blk_alloc_ext_minor(void)
{
- struct gendisk *disk = bdev->bd_disk;
int idx;
- /* in consecutive minor range? */
- if (bdev->bd_partno < disk->minors) {
- *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
- return 0;
- }
-
idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL);
- if (idx < 0)
- return idx == -ENOSPC ? -EBUSY : idx;
-
- *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
- return 0;
+ if (idx < 0) {
+ if (idx == -ENOSPC)
+ return -EBUSY;
+ return idx;
+ }
+ return blk_mangle_minor(idx);
}
-/**
- * blk_free_devt - free a dev_t
- * @devt: dev_t to free
- *
- * Free @devt which was allocated using blk_alloc_devt().
- *
- * CONTEXT:
- * Might sleep.
- */
-void blk_free_devt(dev_t devt)
+void blk_free_ext_minor(unsigned int minor)
{
- if (MAJOR(devt) == BLOCK_EXT_MAJOR)
- ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt)));
+ ida_free(&ext_devt_ida, blk_mangle_minor(minor));
}
static char *bdevt_str(dev_t devt, char *buf)
@@ -499,8 +462,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups,
bool register_queue)
{
- dev_t devt;
- int retval;
+ int ret;
/*
* The disk queue should now be all set with enough information about
@@ -511,23 +473,35 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
if (register_queue)
elevator_init_mq(disk->queue);
- /* minors == 0 indicates to use ext devt from part0 and should
- * be accompanied with EXT_DEVT flag. Make sure all
- * parameters make sense.
+ /*
+ * If the driver provides an explicit major number it also must provide
+ * the number of minor numbers supported, and those will be used to
+ * set up the gendisk.
+ * Otherwise just allocate the device numbers for both the whole device
+ * and all partitions from the extended dev_t space.
*/
- WARN_ON(disk->minors && !(disk->major || disk->first_minor));
- WARN_ON(!disk->minors &&
- !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
+ if (disk->major) {
+ WARN_ON(!disk->minors);
- disk->flags |= GENHD_FL_UP;
+ if (disk->minors > DISK_MAX_PARTS) {
+ pr_err("block: can't allocate more than %d partitions\n",
+ DISK_MAX_PARTS);
+ disk->minors = DISK_MAX_PARTS;
+ }
+ } else {
+ WARN_ON(disk->minors);
- retval = blk_alloc_devt(disk->part0, &devt);
- if (retval) {
- WARN_ON(1);
- return;
+ ret = blk_alloc_ext_minor();
+ if (ret < 0) {
+ WARN_ON(1);
+ return;
+ }
+ disk->major = BLOCK_EXT_MAJOR;
+ disk->first_minor = MINOR(ret);
+ disk->flags |= GENHD_FL_EXT_DEVT;
}
- disk->major = MAJOR(devt);
- disk->first_minor = MINOR(devt);
+
+ disk->flags |= GENHD_FL_UP;
disk_alloc_events(disk);
@@ -541,14 +515,14 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
} else {
struct backing_dev_info *bdi = disk->queue->backing_dev_info;
struct device *dev = disk_to_dev(disk);
- int ret;
/* Register BDI before referencing it from bdev */
- dev->devt = devt;
- ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
+ dev->devt = MKDEV(disk->major, disk->first_minor);
+ ret = bdi_register(bdi, "%u:%u",
+ disk->major, disk->first_minor);
WARN_ON(ret);
bdi_set_owner(bdi, dev);
- bdev_add(disk->part0, devt);
+ bdev_add(disk->part0, dev->devt);
}
register_disk(parent, disk, groups);
if (register_queue)
@@ -558,7 +532,10 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
* Take an extra ref on queue which will be put on disk_release()
* so that it sticks around as long as @disk is there.
*/
- WARN_ON_ONCE(!blk_get_queue(disk->queue));
+ if (blk_get_queue(disk->queue))
+ set_bit(GD_QUEUE_REF, &disk->state);
+ else
+ WARN_ON_ONCE(1);
disk_add_events(disk);
blk_integrity_add(disk);
@@ -607,10 +584,10 @@ void del_gendisk(struct gendisk *disk)
blk_integrity_del(disk);
disk_del_events(disk);
- mutex_lock(&disk->part0->bd_mutex);
+ mutex_lock(&disk->open_mutex);
disk->flags &= ~GENHD_FL_UP;
blk_drop_partitions(disk);
- mutex_unlock(&disk->part0->bd_mutex);
+ mutex_unlock(&disk->open_mutex);
fsync_bdev(disk->part0);
__invalidate_device(disk->part0, true);
@@ -692,32 +669,6 @@ void blk_request_module(dev_t devt)
request_module("block-major-%d", MAJOR(devt));
}
-/**
- * bdget_disk - do bdget() by gendisk and partition number
- * @disk: gendisk of interest
- * @partno: partition number
- *
- * Find partition @partno from @disk, do bdget() on it.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * Resulting block_device on success, NULL on failure.
- */
-struct block_device *bdget_disk(struct gendisk *disk, int partno)
-{
- struct block_device *bdev = NULL;
-
- rcu_read_lock();
- bdev = xa_load(&disk->part_tbl, partno);
- if (bdev && !bdgrab(bdev))
- bdev = NULL;
- rcu_read_unlock();
-
- return bdev;
-}
-
/*
* print a full list of all partitions - intended for places where the root
* filesystem can't be mounted and thus to give the victim some idea of what
@@ -1071,6 +1022,9 @@ static struct attribute *disk_attrs[] = {
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_badblocks.attr,
+ &dev_attr_events.attr,
+ &dev_attr_events_async.attr,
+ &dev_attr_events_poll_msecs.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1120,12 +1074,13 @@ static void disk_release(struct device *dev)
might_sleep();
- blk_free_devt(dev->devt);
+ if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(MINOR(dev->devt));
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
bdput(disk->part0);
- if (disk->queue)
+ if (test_bit(GD_QUEUE_REF, &disk->state) && disk->queue)
blk_put_queue(disk->queue);
kfree(disk);
}
@@ -1242,6 +1197,20 @@ static int __init proc_genhd_init(void)
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */
+dev_t part_devt(struct gendisk *disk, u8 partno)
+{
+ struct block_device *part;
+ dev_t devt = 0;
+
+ rcu_read_lock();
+ part = xa_load(&disk->part_tbl, partno);
+ if (part)
+ devt = part->bd_dev;
+ rcu_read_unlock();
+
+ return devt;
+}
+
dev_t blk_lookup_devt(const char *name, int partno)
{
dev_t devt = MKDEV(0, 0);
@@ -1251,7 +1220,6 @@ dev_t blk_lookup_devt(const char *name, int partno)
class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
while ((dev = class_dev_iter_next(&iter))) {
struct gendisk *disk = dev_to_disk(dev);
- struct block_device *part;
if (strcmp(dev_name(dev), name))
continue;
@@ -1262,13 +1230,10 @@ dev_t blk_lookup_devt(const char *name, int partno)
*/
devt = MKDEV(MAJOR(dev->devt),
MINOR(dev->devt) + partno);
- break;
- }
- part = bdget_disk(disk, partno);
- if (part) {
- devt = part->bd_dev;
- bdput(part);
- break;
+ } else {
+ devt = part_devt(disk, partno);
+ if (devt)
+ break;
}
}
class_dev_iter_exit(&iter);
@@ -1279,13 +1244,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
{
struct gendisk *disk;
- if (minors > DISK_MAX_PARTS) {
- printk(KERN_ERR
- "block: can't allocate more than %d partitions\n",
- DISK_MAX_PARTS);
- minors = DISK_MAX_PARTS;
- }
-
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (!disk)
return NULL;
@@ -1295,6 +1253,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
goto out_free_disk;
disk->node_id = node_id;
+ mutex_init(&disk->open_mutex);
xa_init(&disk->part_tbl);
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
@@ -1315,6 +1274,25 @@ out_free_disk:
}
EXPORT_SYMBOL(__alloc_disk_node);
+struct gendisk *__blk_alloc_disk(int node)
+{
+ struct request_queue *q;
+ struct gendisk *disk;
+
+ q = blk_alloc_queue(node);
+ if (!q)
+ return NULL;
+
+ disk = __alloc_disk_node(0, node);
+ if (!disk) {
+ blk_cleanup_queue(q);
+ return NULL;
+ }
+ disk->queue = q;
+ return disk;
+}
+EXPORT_SYMBOL(__blk_alloc_disk);
+
/**
* put_disk - decrements the gendisk refcount
* @disk: the struct gendisk to decrement the refcount for
@@ -1332,6 +1310,22 @@ void put_disk(struct gendisk *disk)
}
EXPORT_SYMBOL(put_disk);
+/**
+ * blk_cleanup_disk - shutdown a gendisk allocated by blk_alloc_disk
+ * @disk: gendisk to shutdown
+ *
+ * Mark the queue hanging off @disk DYING, drain all pending requests, then mark
+ * the queue DEAD, destroy and put it and the gendisk structure.
+ *
+ * Context: can sleep
+ */
+void blk_cleanup_disk(struct gendisk *disk)
+{
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+EXPORT_SYMBOL(blk_cleanup_disk);
+
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
char event[] = "DISK_RO=1";
@@ -1369,488 +1363,3 @@ int bdev_read_only(struct block_device *bdev)
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
EXPORT_SYMBOL(bdev_read_only);
-
-/*
- * Disk events - monitor disk events like media change and eject request.
- */
-struct disk_events {
- struct list_head node; /* all disk_event's */
- struct gendisk *disk; /* the associated disk */
- spinlock_t lock;
-
- struct mutex block_mutex; /* protects blocking */
- int block; /* event blocking depth */
- unsigned int pending; /* events already sent out */
- unsigned int clearing; /* events being cleared */
-
- long poll_msecs; /* interval, -1 for default */
- struct delayed_work dwork;
-};
-
-static const char *disk_events_strs[] = {
- [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
- [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
-};
-
-static char *disk_uevents[] = {
- [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
- [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
-};
-
-/* list of all disk_events */
-static DEFINE_MUTEX(disk_events_mutex);
-static LIST_HEAD(disk_events);
-
-/* disable in-kernel polling by default */
-static unsigned long disk_events_dfl_poll_msecs;
-
-static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
-{
- struct disk_events *ev = disk->ev;
- long intv_msecs = 0;
-
- /*
- * If device-specific poll interval is set, always use it. If
- * the default is being used, poll if the POLL flag is set.
- */
- if (ev->poll_msecs >= 0)
- intv_msecs = ev->poll_msecs;
- else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
- intv_msecs = disk_events_dfl_poll_msecs;
-
- return msecs_to_jiffies(intv_msecs);
-}
-
-/**
- * disk_block_events - block and flush disk event checking
- * @disk: disk to block events for
- *
- * On return from this function, it is guaranteed that event checking
- * isn't in progress and won't happen until unblocked by
- * disk_unblock_events(). Events blocking is counted and the actual
- * unblocking happens after the matching number of unblocks are done.
- *
- * Note that this intentionally does not block event checking from
- * disk_clear_events().
- *
- * CONTEXT:
- * Might sleep.
- */
-void disk_block_events(struct gendisk *disk)
-{
- struct disk_events *ev = disk->ev;
- unsigned long flags;
- bool cancel;
-
- if (!ev)
- return;
-
- /*
- * Outer mutex ensures that the first blocker completes canceling
- * the event work before further blockers are allowed to finish.
- */
- mutex_lock(&ev->block_mutex);
-
- spin_lock_irqsave(&ev->lock, flags);
- cancel = !ev->block++;
- spin_unlock_irqrestore(&ev->lock, flags);
-
- if (cancel)
- cancel_delayed_work_sync(&disk->ev->dwork);
-
- mutex_unlock(&ev->block_mutex);
-}
-
-static void __disk_unblock_events(struct gendisk *disk, bool check_now)
-{
- struct disk_events *ev = disk->ev;
- unsigned long intv;
- unsigned long flags;
-
- spin_lock_irqsave(&ev->lock, flags);
-
- if (WARN_ON_ONCE(ev->block <= 0))
- goto out_unlock;
-
- if (--ev->block)
- goto out_unlock;
-
- intv = disk_events_poll_jiffies(disk);
- if (check_now)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, 0);
- else if (intv)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, intv);
-out_unlock:
- spin_unlock_irqrestore(&ev->lock, flags);
-}
-
-/**
- * disk_unblock_events - unblock disk event checking
- * @disk: disk to unblock events for
- *
- * Undo disk_block_events(). When the block count reaches zero, it
- * starts events polling if configured.
- *
- * CONTEXT:
- * Don't care. Safe to call from irq context.
- */
-void disk_unblock_events(struct gendisk *disk)
-{
- if (disk->ev)
- __disk_unblock_events(disk, false);
-}
-
-/**
- * disk_flush_events - schedule immediate event checking and flushing
- * @disk: disk to check and flush events for
- * @mask: events to flush
- *
- * Schedule immediate event checking on @disk if not blocked. Events in
- * @mask are scheduled to be cleared from the driver. Note that this
- * doesn't clear the events from @disk->ev.
- *
- * CONTEXT:
- * If @mask is non-zero must be called with bdev->bd_mutex held.
- */
-void disk_flush_events(struct gendisk *disk, unsigned int mask)
-{
- struct disk_events *ev = disk->ev;
-
- if (!ev)
- return;
-
- spin_lock_irq(&ev->lock);
- ev->clearing |= mask;
- if (!ev->block)
- mod_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, 0);
- spin_unlock_irq(&ev->lock);
-}
-
-/**
- * disk_clear_events - synchronously check, clear and return pending events
- * @disk: disk to fetch and clear events from
- * @mask: mask of events to be fetched and cleared
- *
- * Disk events are synchronously checked and pending events in @mask
- * are cleared and returned. This ignores the block count.
- *
- * CONTEXT:
- * Might sleep.
- */
-static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
-{
- struct disk_events *ev = disk->ev;
- unsigned int pending;
- unsigned int clearing = mask;
-
- if (!ev)
- return 0;
-
- disk_block_events(disk);
-
- /*
- * store the union of mask and ev->clearing on the stack so that the
- * race with disk_flush_events does not cause ambiguity (ev->clearing
- * can still be modified even if events are blocked).
- */
- spin_lock_irq(&ev->lock);
- clearing |= ev->clearing;
- ev->clearing = 0;
- spin_unlock_irq(&ev->lock);
-
- disk_check_events(ev, &clearing);
- /*
- * if ev->clearing is not 0, the disk_flush_events got called in the
- * middle of this function, so we want to run the workfn without delay.
- */
- __disk_unblock_events(disk, ev->clearing ? true : false);
-
- /* then, fetch and clear pending events */
- spin_lock_irq(&ev->lock);
- pending = ev->pending & mask;
- ev->pending &= ~mask;
- spin_unlock_irq(&ev->lock);
- WARN_ON_ONCE(clearing & mask);
-
- return pending;
-}
-
-/**
- * bdev_check_media_change - check if a removable media has been changed
- * @bdev: block device to check
- *
- * Check whether a removable media has been changed, and attempt to free all
- * dentries and inodes and invalidates all block device page cache entries in
- * that case.
- *
- * Returns %true if the block device changed, or %false if not.
- */
-bool bdev_check_media_change(struct block_device *bdev)
-{
- unsigned int events;
-
- events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE |
- DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return false;
-
- if (__invalidate_device(bdev, true))
- pr_warn("VFS: busy inodes on changed media %s\n",
- bdev->bd_disk->disk_name);
- set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
- return true;
-}
-EXPORT_SYMBOL(bdev_check_media_change);
-
-/*
- * Separate this part out so that a different pointer for clearing_ptr can be
- * passed in for disk_clear_events.
- */
-static void disk_events_workfn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
-
- disk_check_events(ev, &ev->clearing);
-}
-
-static void disk_check_events(struct disk_events *ev,
- unsigned int *clearing_ptr)
-{
- struct gendisk *disk = ev->disk;
- char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
- unsigned int clearing = *clearing_ptr;
- unsigned int events;
- unsigned long intv;
- int nr_events = 0, i;
-
- /* check events */
- events = disk->fops->check_events(disk, clearing);
-
- /* accumulate pending events and schedule next poll if necessary */
- spin_lock_irq(&ev->lock);
-
- events &= ~ev->pending;
- ev->pending |= events;
- *clearing_ptr &= ~clearing;
-
- intv = disk_events_poll_jiffies(disk);
- if (!ev->block && intv)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, intv);
-
- spin_unlock_irq(&ev->lock);
-
- /*
- * Tell userland about new events. Only the events listed in
- * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
- * is set. Otherwise, events are processed internally but never
- * get reported to userland.
- */
- for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
- if ((events & disk->events & (1 << i)) &&
- (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
- envp[nr_events++] = disk_uevents[i];
-
- if (nr_events)
- kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
-}
-
-/*
- * A disk events enabled device has the following sysfs nodes under
- * its /sys/block/X/ directory.
- *
- * events : list of all supported events
- * events_async : list of events which can be detected w/o polling
- * (always empty, only for backwards compatibility)
- * events_poll_msecs : polling interval, 0: disable, -1: system default
- */
-static ssize_t __disk_events_show(unsigned int events, char *buf)
-{
- const char *delim = "";
- ssize_t pos = 0;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
- if (events & (1 << i)) {
- pos += sprintf(buf + pos, "%s%s",
- delim, disk_events_strs[i]);
- delim = " ";
- }
- if (pos)
- pos += sprintf(buf + pos, "\n");
- return pos;
-}
-
-static ssize_t disk_events_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
- return 0;
-
- return __disk_events_show(disk->events, buf);
-}
-
-static ssize_t disk_events_async_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return 0;
-}
-
-static ssize_t disk_events_poll_msecs_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (!disk->ev)
- return sprintf(buf, "-1\n");
-
- return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
-}
-
-static ssize_t disk_events_poll_msecs_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct gendisk *disk = dev_to_disk(dev);
- long intv;
-
- if (!count || !sscanf(buf, "%ld", &intv))
- return -EINVAL;
-
- if (intv < 0 && intv != -1)
- return -EINVAL;
-
- if (!disk->ev)
- return -ENODEV;
-
- disk_block_events(disk);
- disk->ev->poll_msecs = intv;
- __disk_unblock_events(disk, true);
-
- return count;
-}
-
-static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
-static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
-static const DEVICE_ATTR(events_poll_msecs, 0644,
- disk_events_poll_msecs_show,
- disk_events_poll_msecs_store);
-
-static const struct attribute *disk_events_attrs[] = {
- &dev_attr_events.attr,
- &dev_attr_events_async.attr,
- &dev_attr_events_poll_msecs.attr,
- NULL,
-};
-
-/*
- * The default polling interval can be specified by the kernel
- * parameter block.events_dfl_poll_msecs which defaults to 0
- * (disable). This can also be modified runtime by writing to
- * /sys/module/block/parameters/events_dfl_poll_msecs.
- */
-static int disk_events_set_dfl_poll_msecs(const char *val,
- const struct kernel_param *kp)
-{
- struct disk_events *ev;
- int ret;
-
- ret = param_set_ulong(val, kp);
- if (ret < 0)
- return ret;
-
- mutex_lock(&disk_events_mutex);
-
- list_for_each_entry(ev, &disk_events, node)
- disk_flush_events(ev->disk, 0);
-
- mutex_unlock(&disk_events_mutex);
-
- return 0;
-}
-
-static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
- .set = disk_events_set_dfl_poll_msecs,
- .get = param_get_ulong,
-};
-
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "block."
-
-module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
- &disk_events_dfl_poll_msecs, 0644);
-
-/*
- * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
- */
-static void disk_alloc_events(struct gendisk *disk)
-{
- struct disk_events *ev;
-
- if (!disk->fops->check_events || !disk->events)
- return;
-
- ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev) {
- pr_warn("%s: failed to initialize events\n", disk->disk_name);
- return;
- }
-
- INIT_LIST_HEAD(&ev->node);
- ev->disk = disk;
- spin_lock_init(&ev->lock);
- mutex_init(&ev->block_mutex);
- ev->block = 1;
- ev->poll_msecs = -1;
- INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
-
- disk->ev = ev;
-}
-
-static void disk_add_events(struct gendisk *disk)
-{
- /* FIXME: error handling */
- if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
- pr_warn("%s: failed to create sysfs files for events\n",
- disk->disk_name);
-
- if (!disk->ev)
- return;
-
- mutex_lock(&disk_events_mutex);
- list_add_tail(&disk->ev->node, &disk_events);
- mutex_unlock(&disk_events_mutex);
-
- /*
- * Block count is initialized to 1 and the following initial
- * unblock kicks it into action.
- */
- __disk_unblock_events(disk, true);
-}
-
-static void disk_del_events(struct gendisk *disk)
-{
- if (disk->ev) {
- disk_block_events(disk);
-
- mutex_lock(&disk_events_mutex);
- list_del_init(&disk->ev->node);
- mutex_unlock(&disk_events_mutex);
- }
-
- sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
-}
-
-static void disk_release_events(struct gendisk *disk)
-{
- /* the block count should be 1 from disk_del_events() */
- WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
- kfree(disk->ev);
-}
diff --git a/block/ioctl.c b/block/ioctl.c
index 8ba1ed8defd0..24beec9ca9c9 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -89,7 +89,7 @@ static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- if (bdev->bd_part_count)
+ if (bdev->bd_disk->open_partitions)
return -EBUSY;
/*
diff --git a/block/mq-deadline-cgroup.c b/block/mq-deadline-cgroup.c
new file mode 100644
index 000000000000..3b4bfddec39f
--- /dev/null
+++ b/block/mq-deadline-cgroup.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/blk-cgroup.h>
+#include <linux/ioprio.h>
+
+#include "mq-deadline-cgroup.h"
+
+static struct blkcg_policy dd_blkcg_policy;
+
+static struct blkcg_policy_data *dd_cpd_alloc(gfp_t gfp)
+{
+ struct dd_blkcg *pd;
+
+ pd = kzalloc(sizeof(*pd), gfp);
+ if (!pd)
+ return NULL;
+ pd->stats = alloc_percpu_gfp(typeof(*pd->stats),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!pd->stats) {
+ kfree(pd);
+ return NULL;
+ }
+ return &pd->cpd;
+}
+
+static void dd_cpd_free(struct blkcg_policy_data *cpd)
+{
+ struct dd_blkcg *dd_blkcg = container_of(cpd, typeof(*dd_blkcg), cpd);
+
+ free_percpu(dd_blkcg->stats);
+ kfree(dd_blkcg);
+}
+
+static struct dd_blkcg *dd_blkcg_from_pd(struct blkg_policy_data *pd)
+{
+ return container_of(blkcg_to_cpd(pd->blkg->blkcg, &dd_blkcg_policy),
+ struct dd_blkcg, cpd);
+}
+
+/*
+ * Convert an association between a block cgroup and a request queue into a
+ * pointer to the mq-deadline information associated with a (blkcg, queue) pair.
+ */
+struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio)
+{
+ struct blkg_policy_data *pd;
+
+ pd = blkg_to_pd(bio->bi_blkg, &dd_blkcg_policy);
+ if (!pd)
+ return NULL;
+
+ return dd_blkcg_from_pd(pd);
+}
+
+static size_t dd_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
+{
+ static const char *const prio_class_name[] = {
+ [IOPRIO_CLASS_NONE] = "NONE",
+ [IOPRIO_CLASS_RT] = "RT",
+ [IOPRIO_CLASS_BE] = "BE",
+ [IOPRIO_CLASS_IDLE] = "IDLE",
+ };
+ struct dd_blkcg *blkcg = dd_blkcg_from_pd(pd);
+ int res = 0;
+ u8 prio;
+
+ for (prio = 0; prio < ARRAY_SIZE(blkcg->stats->stats); prio++)
+ res += scnprintf(buf + res, size - res,
+ " [%s] dispatched=%u inserted=%u merged=%u",
+ prio_class_name[prio],
+ ddcg_sum(blkcg, dispatched, prio) +
+ ddcg_sum(blkcg, merged, prio) -
+ ddcg_sum(blkcg, completed, prio),
+ ddcg_sum(blkcg, inserted, prio) -
+ ddcg_sum(blkcg, completed, prio),
+ ddcg_sum(blkcg, merged, prio));
+
+ return res;
+}
+
+static struct blkg_policy_data *dd_pd_alloc(gfp_t gfp, struct request_queue *q,
+ struct blkcg *blkcg)
+{
+ struct dd_blkg *pd;
+
+ pd = kzalloc(sizeof(*pd), gfp);
+ if (!pd)
+ return NULL;
+ return &pd->pd;
+}
+
+static void dd_pd_free(struct blkg_policy_data *pd)
+{
+ struct dd_blkg *dd_blkg = container_of(pd, typeof(*dd_blkg), pd);
+
+ kfree(dd_blkg);
+}
+
+static struct blkcg_policy dd_blkcg_policy = {
+ .cpd_alloc_fn = dd_cpd_alloc,
+ .cpd_free_fn = dd_cpd_free,
+
+ .pd_alloc_fn = dd_pd_alloc,
+ .pd_free_fn = dd_pd_free,
+ .pd_stat_fn = dd_pd_stat,
+};
+
+int dd_activate_policy(struct request_queue *q)
+{
+ return blkcg_activate_policy(q, &dd_blkcg_policy);
+}
+
+void dd_deactivate_policy(struct request_queue *q)
+{
+ blkcg_deactivate_policy(q, &dd_blkcg_policy);
+}
+
+int __init dd_blkcg_init(void)
+{
+ return blkcg_policy_register(&dd_blkcg_policy);
+}
+
+void __exit dd_blkcg_exit(void)
+{
+ blkcg_policy_unregister(&dd_blkcg_policy);
+}
diff --git a/block/mq-deadline-cgroup.h b/block/mq-deadline-cgroup.h
new file mode 100644
index 000000000000..0143fd74f3ce
--- /dev/null
+++ b/block/mq-deadline-cgroup.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#if !defined(_MQ_DEADLINE_CGROUP_H_)
+#define _MQ_DEADLINE_CGROUP_H_
+
+#include <linux/blk-cgroup.h>
+
+struct request_queue;
+
+/**
+ * struct io_stats_per_prio - I/O statistics per I/O priority class.
+ * @inserted: Number of inserted requests.
+ * @merged: Number of merged requests.
+ * @dispatched: Number of dispatched requests.
+ * @completed: Number of I/O completions.
+ */
+struct io_stats_per_prio {
+ local_t inserted;
+ local_t merged;
+ local_t dispatched;
+ local_t completed;
+};
+
+/* I/O statistics per I/O cgroup per I/O priority class (IOPRIO_CLASS_*). */
+struct blkcg_io_stats {
+ struct io_stats_per_prio stats[4];
+};
+
+/**
+ * struct dd_blkcg - Per cgroup data.
+ * @cpd: blkcg_policy_data structure.
+ * @stats: I/O statistics.
+ */
+struct dd_blkcg {
+ struct blkcg_policy_data cpd; /* must be the first member */
+ struct blkcg_io_stats __percpu *stats;
+};
+
+/*
+ * Count one event of type 'event_type' and with I/O priority class
+ * 'prio_class'.
+ */
+#define ddcg_count(ddcg, event_type, prio_class) do { \
+if (ddcg) { \
+ struct blkcg_io_stats *io_stats = get_cpu_ptr((ddcg)->stats); \
+ \
+ BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \
+ BUILD_BUG_ON(!__same_type((prio_class), u8)); \
+ local_inc(&io_stats->stats[(prio_class)].event_type); \
+ put_cpu_ptr(io_stats); \
+} \
+} while (0)
+
+/*
+ * Returns the total number of ddcg_count(ddcg, event_type, prio_class) calls
+ * across all CPUs. No locking or barriers since it is fine if the returned
+ * sum is slightly outdated.
+ */
+#define ddcg_sum(ddcg, event_type, prio) ({ \
+ unsigned int cpu; \
+ u32 sum = 0; \
+ \
+ BUILD_BUG_ON(!__same_type((ddcg), struct dd_blkcg *)); \
+ BUILD_BUG_ON(!__same_type((prio), u8)); \
+ for_each_present_cpu(cpu) \
+ sum += local_read(&per_cpu_ptr((ddcg)->stats, cpu)-> \
+ stats[(prio)].event_type); \
+ sum; \
+})
+
+#ifdef CONFIG_BLK_CGROUP
+
+/**
+ * struct dd_blkg - Per (cgroup, request queue) data.
+ * @pd: blkg_policy_data structure.
+ */
+struct dd_blkg {
+ struct blkg_policy_data pd; /* must be the first member */
+};
+
+struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio);
+int dd_activate_policy(struct request_queue *q);
+void dd_deactivate_policy(struct request_queue *q);
+int __init dd_blkcg_init(void);
+void __exit dd_blkcg_exit(void);
+
+#else /* CONFIG_BLK_CGROUP */
+
+static inline struct dd_blkcg *dd_blkcg_from_bio(struct bio *bio)
+{
+ return NULL;
+}
+
+static inline int dd_activate_policy(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void dd_deactivate_policy(struct request_queue *q)
+{
+}
+
+static inline int dd_blkcg_init(void)
+{
+ return 0;
+}
+
+static inline void dd_blkcg_exit(void)
+{
+}
+
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif /* _MQ_DEADLINE_CGROUP_H_ */
diff --git a/block/mq-deadline-main.c b/block/mq-deadline-main.c
new file mode 100644
index 000000000000..6f612e6dc82b
--- /dev/null
+++ b/block/mq-deadline-main.c
@@ -0,0 +1,1175 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ * for the blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-debugfs.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "mq-deadline-cgroup.h"
+
+/*
+ * See Documentation/block/deadline-iosched.rst
+ */
+static const int read_expire = HZ / 2; /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int aging_expire = 10 * HZ;
+static const int writes_starved = 2; /* max times reads can starve a write */
+static const int fifo_batch = 16; /* # of sequential requests treated as one
+ by the above parameters. For throughput. */
+
+enum dd_data_dir {
+ DD_READ = READ,
+ DD_WRITE = WRITE,
+};
+
+enum { DD_DIR_COUNT = 2 };
+
+enum dd_prio {
+ DD_RT_PRIO = 0,
+ DD_BE_PRIO = 1,
+ DD_IDLE_PRIO = 2,
+ DD_PRIO_MAX = 2,
+};
+
+enum { DD_PRIO_COUNT = 3 };
+
+/* I/O statistics for all I/O priorities (enum dd_prio). */
+struct io_stats {
+ struct io_stats_per_prio stats[DD_PRIO_COUNT];
+};
+
+/*
+ * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
+ * present on both sort_list[] and fifo_list[].
+ */
+struct dd_per_prio {
+ struct list_head dispatch;
+ struct rb_root sort_list[DD_DIR_COUNT];
+ struct list_head fifo_list[DD_DIR_COUNT];
+ /* Next request in sector-sorted order. Read, write or both are NULL. */
+ struct request *next_rq[DD_DIR_COUNT];
+};
+
+struct deadline_data {
+ /*
+ * run time data
+ */
+
+ /* Request queue that owns this data structure. */
+ struct request_queue *queue;
+
+ struct dd_per_prio per_prio[DD_PRIO_COUNT];
+
+ /* Data direction of latest dispatched request. */
+ enum dd_data_dir last_dir;
+ unsigned int batching; /* number of sequential requests made */
+ unsigned int starved; /* times reads have starved writes */
+
+ struct io_stats __percpu *stats;
+
+ /*
+ * settings that change how the i/o scheduler behaves
+ */
+ int fifo_expire[DD_DIR_COUNT];
+ int fifo_batch;
+ int writes_starved;
+ int front_merges;
+ u32 async_depth;
+ int aging_expire;
+
+ spinlock_t lock;
+ spinlock_t zone_lock;
+};
+
+/* Count one event of type 'event_type' and with I/O priority 'prio' */
+#define dd_count(dd, event_type, prio) do { \
+ struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
+ \
+ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
+ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
+ local_inc(&io_stats->stats[(prio)].event_type); \
+ put_cpu_ptr(io_stats); \
+} while (0)
+
+/*
+ * Returns the total number of dd_count(dd, event_type, prio) calls across all
+ * CPUs. No locking or barriers since it is fine if the returned sum is slightly
+ * outdated.
+ */
+#define dd_sum(dd, event_type, prio) ({ \
+ unsigned int cpu; \
+ u32 sum = 0; \
+ \
+ BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
+ BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
+ for_each_present_cpu(cpu) \
+ sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
+ stats[(prio)].event_type); \
+ sum; \
+})
+
+/* Maps an I/O priority class to a deadline scheduler priority. */
+static const enum dd_prio ioprio_class_to_prio[] = {
+ [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_RT] = DD_RT_PRIO,
+ [IOPRIO_CLASS_BE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO,
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
+{
+ return &per_prio->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
+ * request.
+ */
+static u8 dd_rq_ioclass(struct request *rq)
+{
+ return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+ struct rb_node *node = rb_next(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
+{
+ struct rb_root *root = deadline_rb_root(per_prio, rq);
+
+ elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
+{
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+
+ if (per_prio->next_rq[data_dir] == rq)
+ per_prio->next_rq[data_dir] = deadline_latter_request(rq);
+
+ elv_rb_del(deadline_rb_root(per_prio, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q,
+ struct dd_per_prio *per_prio,
+ struct request *rq)
+{
+ list_del_init(&rq->queuelist);
+
+ /*
+ * We might not be on the rbtree, if we are doing an insert merge
+ */
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ deadline_del_rq_rb(per_prio, rq);
+
+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+ enum elv_merge type)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(req);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ /*
+ * if the merge was a front merge, we need to reposition request
+ */
+ if (type == ELEVATOR_FRONT_MERGE) {
+ elv_rb_del(deadline_rb_root(per_prio, req), req);
+ deadline_add_rq_rb(per_prio, req);
+ }
+}
+
+/*
+ * Callback function that is invoked after @next has been merged into @req.
+ */
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+ struct request *next)
+{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(next);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_blkcg *blkcg = next->elv.priv[0];
+
+ dd_count(dd, merged, prio);
+ ddcg_count(blkcg, merged, ioprio_class);
+
+ /*
+ * if next expires before rq, assign its expire time to rq
+ * and move into next position (next will be deleted) in fifo
+ */
+ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+ if (time_before((unsigned long)next->fifo_time,
+ (unsigned long)req->fifo_time)) {
+ list_move(&req->queuelist, &next->queuelist);
+ req->fifo_time = next->fifo_time;
+ }
+ }
+
+ /*
+ * kill knowledge of next, this one is a goner
+ */
+ deadline_remove_request(q, &dd->per_prio[prio], next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ struct request *rq)
+{
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+
+ per_prio->next_rq[data_dir] = deadline_latter_request(rq);
+
+ /*
+ * take it off the sort and fifo list
+ */
+ deadline_remove_request(rq->q, per_prio, rq);
+}
+
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+ return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&per_prio->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+
+ /*
+ * rq is expired!
+ */
+ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+ return 1;
+
+ return 0;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using arrival ordered lists.
+ */
+static struct request *
+deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ if (list_empty(&per_prio->fifo_list[data_dir]))
+ return NULL;
+
+ rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ goto out;
+ }
+ rq = NULL;
+out:
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
+ * For the specified data direction, return the next request to
+ * dispatch using sector position sorted lists.
+ */
+static struct request *
+deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ rq = per_prio->next_rq[data_dir];
+ if (!rq)
+ return NULL;
+
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
+ return rq;
+
+ /*
+ * Look for a write request that can be dispatched, that is one with
+ * an unlocked target zone.
+ */
+ spin_lock_irqsave(&dd->zone_lock, flags);
+ while (rq) {
+ if (blk_req_can_dispatch_to_zone(rq))
+ break;
+ rq = deadline_latter_request(rq);
+ }
+ spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ return rq;
+}
+
+/*
+ * __dd_dispatch_request selects the best request according to read/write
+ * expire, fifo_batch, etc. and with a start time <= @latest_start_ns.
+ */
+static struct request *__dd_dispatch_request(struct deadline_data *dd,
+ struct dd_per_prio *per_prio,
+ u64 latest_start_ns)
+{
+ struct request *rq, *next_rq;
+ enum dd_data_dir data_dir;
+ struct dd_blkcg *blkcg;
+ enum dd_prio prio;
+ u8 ioprio_class;
+
+ lockdep_assert_held(&dd->lock);
+
+ if (!list_empty(&per_prio->dispatch)) {
+ rq = list_first_entry(&per_prio->dispatch, struct request,
+ queuelist);
+ if (rq->start_time_ns > latest_start_ns)
+ return NULL;
+ list_del_init(&rq->queuelist);
+ goto done;
+ }
+
+ /*
+ * batches are currently reads XOR writes
+ */
+ rq = deadline_next_request(dd, per_prio, dd->last_dir);
+ if (rq && dd->batching < dd->fifo_batch)
+ /* we have a next request and are still entitled to batch */
+ goto dispatch_request;
+
+ /*
+ * at this point we are not running a batch. select the appropriate
+ * data direction (read / write)
+ */
+
+ if (!list_empty(&per_prio->fifo_list[DD_READ])) {
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
+
+ if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
+ (dd->starved++ >= dd->writes_starved))
+ goto dispatch_writes;
+
+ data_dir = DD_READ;
+
+ goto dispatch_find_request;
+ }
+
+ /*
+ * there are either no reads or writes have been starved
+ */
+
+ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
+dispatch_writes:
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
+
+ dd->starved = 0;
+
+ data_dir = DD_WRITE;
+
+ goto dispatch_find_request;
+ }
+
+ return NULL;
+
+dispatch_find_request:
+ /*
+ * we are not running a batch, find best request for selected data_dir
+ */
+ next_rq = deadline_next_request(dd, per_prio, data_dir);
+ if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
+ /*
+ * A deadline has expired, the last request was in the other
+ * direction, or we have run out of higher-sectored requests.
+ * Start again from the request with the earliest expiry time.
+ */
+ rq = deadline_fifo_request(dd, per_prio, data_dir);
+ } else {
+ /*
+ * The last req was the same dir and we have a next request in
+ * sort order. No expired requests so continue on from here.
+ */
+ rq = next_rq;
+ }
+
+ /*
+ * For a zoned block device, if we only have writes queued and none of
+ * them can be dispatched, rq will be NULL.
+ */
+ if (!rq)
+ return NULL;
+
+ dd->last_dir = data_dir;
+ dd->batching = 0;
+
+dispatch_request:
+ if (rq->start_time_ns > latest_start_ns)
+ return NULL;
+ /*
+ * rq is the selected appropriate request.
+ */
+ dd->batching++;
+ deadline_move_request(dd, per_prio, rq);
+done:
+ ioprio_class = dd_rq_ioclass(rq);
+ prio = ioprio_class_to_prio[ioprio_class];
+ dd_count(dd, dispatched, prio);
+ blkcg = rq->elv.priv[0];
+ ddcg_count(blkcg, dispatched, ioprio_class);
+ /*
+ * If the request needs its target zone locked, do it.
+ */
+ blk_req_zone_write_lock(rq);
+ rq->rq_flags |= RQF_STARTED;
+ return rq;
+}
+
+/*
+ * Pick the next request to hand to the driver.
+ *
+ * Reached via blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
+ *
+ * Although the call is made on behalf of one hardware queue, the request
+ * that comes back may belong to a different one: mq-deadline keeps a single
+ * set of sort trees and FIFOs that all hardware queues share.
+ */
+static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	const u64 now_ns = ktime_get_ns();
+	struct request *rq = NULL;
+	enum dd_prio prio;
+
+	spin_lock(&dd->lock);
+
+	/*
+	 * Pass 1 (aging): walk DD_BE_PRIO..DD_PRIO_MAX and pass "now minus
+	 * aging_expire" as the latest start time a request may have.
+	 */
+	prio = DD_BE_PRIO;
+	while (!rq && prio <= DD_PRIO_MAX) {
+		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+				now_ns - jiffies_to_nsecs(dd->aging_expire));
+		prio++;
+	}
+
+	/*
+	 * Pass 2 (strict priority): try each level in order and stop at the
+	 * first one that yields a request or that still has requests queued,
+	 * so lower levels are ignored while a higher level is pending.
+	 */
+	if (!rq) {
+		for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+			rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+						   now_ns);
+			if (rq)
+				break;
+			if (dd_queued(dd, prio))
+				break;
+		}
+	}
+
+	spin_unlock(&dd->lock);
+
+	return rq;
+}
+
+/*
+ * Invoked by __blk_mq_alloc_request(); the shallow_depth stored here is
+ * consumed by __blk_mq_get_tag().
+ *
+ * Synchronous reads are left unthrottled. Every other operation (writes and
+ * asynchronous requests) is limited to dd->async_depth so that it cannot
+ * block the allocation of synchronous requests.
+ */
+static void dd_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+{
+	struct deadline_data *dd = data->q->elevator->elevator_data;
+	const bool sync_read = op_is_sync(op) && !op_is_write(op);
+
+	if (!sync_read)
+		data->shallow_depth = dd->async_depth;
+}
+
+/*
+ * Called by blk_mq_update_nr_requests().
+ *
+ * Recompute async_depth as three quarters of the queue's nr_requests (never
+ * less than 1) and tell the scheduler tag bitmap about that minimum shallow
+ * depth.
+ */
+static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
+
+	sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags,
+					dd->async_depth);
+}
+
+/*
+ * Called by blk_mq_init_hctx() and blk_mq_init_sched().
+ *
+ * Per-hctx setup only needs the depth limits refreshed; @hctx_idx is unused.
+ * Always returns 0.
+ */
+static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	dd_depth_updated(hctx);
+
+	return 0;
+}
+
+/*
+ * Tear down the scheduler instance attached to @e: deactivate the policy
+ * for the queue, warn once per FIFO that is unexpectedly non-empty, then
+ * release the per-CPU statistics and the deadline_data itself.
+ */
+static void dd_exit_sched(struct elevator_queue *e)
+{
+	struct deadline_data *dd = e->elevator_data;
+	struct dd_per_prio *pp;
+
+	dd_deactivate_policy(dd->queue);
+
+	for (pp = &dd->per_prio[0]; pp <= &dd->per_prio[DD_PRIO_MAX]; pp++) {
+		WARN_ON_ONCE(!list_empty(&pp->fifo_list[DD_READ]));
+		WARN_ON_ONCE(!list_empty(&pp->fifo_list[DD_WRITE]));
+	}
+
+	free_percpu(dd->stats);
+	kfree(dd);
+}
+
+/*
+ * Allocate and initialize the elevator private data (deadline_data) for @q
+ * and activate the policy for the queue.
+ *
+ * Returns 0 on success with q->elevator set, -ENOMEM if an allocation fails,
+ * or the error reported by dd_activate_policy(). On failure everything that
+ * was allocated here is released again.
+ */
+static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+	struct elevator_queue *eq;
+	struct deadline_data *dd;
+	enum dd_prio prio;
+	int ret;
+
+	/*
+	 * Initialization would be very tricky if the queue is not frozen,
+	 * hence the warning statement below.
+	 */
+	WARN_ON_ONCE(!percpu_ref_is_zero(&q->q_usage_counter));
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	ret = -ENOMEM;
+	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+	if (!dd)
+		goto err_put_eq;
+
+	eq->elevator_data = dd;
+
+	dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
+				     GFP_KERNEL | __GFP_ZERO);
+	if (!dd->stats)
+		goto err_free_dd;
+
+	dd->queue = q;
+
+	/* Empty dispatch list, FIFOs and sort trees for every priority level. */
+	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+		struct dd_per_prio *pp = &dd->per_prio[prio];
+
+		INIT_LIST_HEAD(&pp->dispatch);
+		INIT_LIST_HEAD(&pp->fifo_list[DD_READ]);
+		INIT_LIST_HEAD(&pp->fifo_list[DD_WRITE]);
+		pp->sort_list[DD_READ] = RB_ROOT;
+		pp->sort_list[DD_WRITE] = RB_ROOT;
+	}
+
+	/* Tunables start out at their file-level defaults. */
+	dd->fifo_expire[DD_READ] = read_expire;
+	dd->fifo_expire[DD_WRITE] = write_expire;
+	dd->writes_starved = writes_starved;
+	dd->front_merges = 1;
+	dd->last_dir = DD_WRITE;
+	dd->fifo_batch = fifo_batch;
+	dd->aging_expire = aging_expire;
+	spin_lock_init(&dd->lock);
+	spin_lock_init(&dd->zone_lock);
+
+	ret = dd_activate_policy(q);
+	if (ret)
+		goto err_free_stats;
+
+	q->elevator = eq;
+	return 0;
+
+err_free_stats:
+	free_percpu(dd->stats);
+
+err_free_dd:
+	kfree(dd);
+
+err_put_eq:
+	kobject_put(&eq->kobj);
+	return ret;
+}
+
+/*
+ * Look for a queued request that @bio can be front-merged into.
+ *
+ * The search uses the sort tree of the priority level derived from the bio's
+ * I/O priority class and of the bio's data direction, keyed by the bio's end
+ * sector. On success the request is stored in *@rq and
+ * ELEVATOR_FRONT_MERGE is returned; otherwise, or when front merges are
+ * disabled, ELEVATOR_NO_MERGE is returned and *@rq is left untouched.
+ */
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+			    struct bio *bio)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+	struct dd_per_prio *per_prio;
+	struct request *candidate;
+	sector_t end_sector;
+
+	if (!dd->front_merges)
+		return ELEVATOR_NO_MERGE;
+
+	per_prio = &dd->per_prio[ioprio_class_to_prio[ioprio_class]];
+	end_sector = bio_end_sector(bio);
+
+	candidate = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)],
+				end_sector);
+	if (!candidate)
+		return ELEVATOR_NO_MERGE;
+
+	BUG_ON(end_sector != blk_rq_pos(candidate));
+
+	if (!elv_bio_merge_ok(candidate, bio))
+		return ELEVATOR_NO_MERGE;
+
+	*rq = candidate;
+	return ELEVATOR_FRONT_MERGE;
+}
+
+/*
+ * Try to merge @bio into a request that is already queued. This runs before
+ * @bio has been associated with a request.
+ *
+ * The merge attempt is made under dd->lock; a request handed back through
+ * the last argument of blk_mq_sched_try_merge() is freed only after the lock
+ * has been dropped. Returns the result of blk_mq_sched_try_merge().
+ */
+static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
+			 unsigned int nr_segs)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	struct request *to_free = NULL;
+	bool merged;
+
+	spin_lock(&dd->lock);
+	merged = blk_mq_sched_try_merge(q, bio, nr_segs, &to_free);
+	spin_unlock(&dd->lock);
+
+	if (to_free)
+		blk_mq_free_request(to_free);
+
+	return merged;
+}
+
+/*
+ * add rq to rbtree and fifo
+ *
+ * The caller must hold dd->lock. With @at_head set the request is put on the
+ * per-priority dispatch list instead of the sort tree and FIFO. If the
+ * request can be merged into an already queued one, it is not inserted at
+ * all.
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+ u16 ioprio = req_get_ioprio(rq);
+ u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+ struct dd_per_prio *per_prio;
+ enum dd_prio prio;
+ struct dd_blkcg *blkcg;
+ LIST_HEAD(free);
+
+ lockdep_assert_held(&dd->lock);
+
+ /*
+ * This may be a requeue of a write request that has locked its
+ * target zone. If it is the case, this releases the zone lock.
+ */
+ blk_req_zone_write_unlock(rq);
+
+ /*
+ * Map the request's own I/O priority class to a scheduler priority
+ * level and count the request as inserted, both in the scheduler
+ * statistics and in those of the block cgroup looked up from the
+ * request's bio. The cgroup pointer is remembered in elv.priv[0].
+ * Note that this accounting happens before the merge attempt below.
+ */
+ prio = ioprio_class_to_prio[ioprio_class];
+ dd_count(dd, inserted, prio);
+ blkcg = dd_blkcg_from_bio(rq->bio);
+ ddcg_count(blkcg, inserted, ioprio_class);
+ rq->elv.priv[0] = blkcg;
+
+ /*
+ * If the request was merged, release whatever requests were collected
+ * on the local free list; there is nothing left to insert.
+ */
+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
+ blk_mq_free_requests(&free);
+ return;
+ }
+
+ trace_block_rq_insert(rq);
+
+ per_prio = &dd->per_prio[prio];
+ if (at_head) {
+ list_add(&rq->queuelist, &per_prio->dispatch);
+ } else {
+ deadline_add_rq_rb(per_prio, rq);
+
+ /* Make the request findable for later merges. */
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+
+ /*
+ * set expire time and add to fifo list
+ */
+ rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+ list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
+ }
+}
+
+/*
+ * Called from blk_mq_sched_insert_request() or
+ * blk_mq_sched_insert_requests().
+ *
+ * Drains @list from the front: each request is unlinked and handed to
+ * dd_insert_request() while dd->lock is held for the whole batch.
+ */
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+			       struct list_head *list, bool at_head)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request *rq;
+
+	spin_lock(&dd->lock);
+	for (;;) {
+		if (list_empty(list))
+			break;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		dd_insert_request(hctx, rq, at_head);
+	}
+	spin_unlock(&dd->lock);
+}
+
+/*
+ * Callback from inside blk_mq_rq_ctx_init().
+ *
+ * Start every request without a block cgroup pointer in elv.priv[0].
+ */
+static void dd_prepare_request(struct request *rq)
+{
+	rq->elv.priv[0] = NULL;
+}
+
+/*
+ * Callback from inside blk_mq_free_request(); runs for every request,
+ * whether or not it completed successfully.
+ *
+ * The request is first counted as completed, both globally and for the
+ * block cgroup stored in elv.priv[0].
+ *
+ * For zoned block devices the target zone of a completed write is then
+ * write-unlocked. This is done under the zone lock spinlock so that a zone
+ * is never unlocked while deadline_fifo_request() or
+ * deadline_next_request() are executing.
+ *
+ * On such devices __dd_dispatch_request() may have stopped dispatching
+ * because every queued request is a write aimed at a zone that is already
+ * locked by an on-going write. To guarantee forward progress, the hardware
+ * queue is marked as needing a restart whenever writes are still queued at
+ * this priority level, so that the queue is run again once the zone has
+ * been unlocked.
+ */
+static void dd_finish_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct deadline_data *dd = q->elevator->elevator_data;
+	struct dd_blkcg *blkcg = rq->elv.priv[0];
+	const u8 ioprio_class = dd_rq_ioclass(rq);
+	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+	unsigned long irq_flags;
+
+	dd_count(dd, completed, prio);
+	ddcg_count(blkcg, completed, ioprio_class);
+
+	if (!blk_queue_is_zoned(q))
+		return;
+
+	spin_lock_irqsave(&dd->zone_lock, irq_flags);
+	blk_req_zone_write_unlock(rq);
+	if (!list_empty(&dd->per_prio[prio].fifo_list[DD_WRITE]))
+		blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
+	spin_unlock_irqrestore(&dd->zone_lock, irq_flags);
+}
+
+/*
+ * Return true if the dispatch list or either FIFO of @per_prio holds at
+ * least one request. The lists are tested with list_empty_careful().
+ */
+static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
+{
+	if (!list_empty_careful(&per_prio->dispatch))
+		return true;
+
+	if (!list_empty_careful(&per_prio->fifo_list[DD_READ]))
+		return true;
+
+	return !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
+}
+
+/*
+ * Return true as soon as any priority level of the scheduler attached to
+ * @hctx's queue has a request pending, false if all of them are idle.
+ */
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	enum dd_prio prio = 0;
+
+	while (prio <= DD_PRIO_MAX) {
+		if (dd_has_work_for_prio(&dd->per_prio[prio]))
+			return true;
+		prio++;
+	}
+
+	return false;
+}
+
+/*
+ * sysfs parts below
+ *
+ * SHOW_INT() generates a show handler that prints @__VAR as a signed
+ * decimal followed by a newline. SHOW_JIFFIES() does the same after
+ * converting a jiffies value to milliseconds.
+ */
+#define SHOW_INT(__FUNC, __VAR) \
+static ssize_t __FUNC(struct elevator_queue *e, char *page) \
+{ \
+	struct deadline_data *dd = e->elevator_data; \
+ \
+	return sysfs_emit(page, "%d\n", __VAR); \
+}
+#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
+SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
+SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_aging_expire_show, dd->aging_expire);
+SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
+SHOW_INT(deadline_front_merges_show, dd->front_merges);
+/* async_depth must report dd->async_depth, not the front_merges flag. */
+SHOW_INT(deadline_async_depth_show, dd->async_depth);
+SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
+#undef SHOW_INT
+#undef SHOW_JIFFIES
+
+/*
+ * STORE_FUNCTION() generates a store handler that parses @page with
+ * kstrtoint() (base 0), clamps the value to [MIN, MAX], converts it with
+ * __CONV and writes the result through __PTR. It returns @count on success
+ * or the negative error from kstrtoint().
+ *
+ * STORE_INT() stores the clamped value as is; STORE_JIFFIES() interprets it
+ * as milliseconds and stores jiffies.
+ */
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
+{ \
+	struct deadline_data *dd = e->elevator_data; \
+	int __data, __ret; \
+ \
+	__ret = kstrtoint(page, 0, &__data); \
+	if (__ret < 0) \
+		return __ret; \
+	if (__data < (MIN)) \
+		__data = (MIN); \
+	else if (__data > (MAX)) \
+		__data = (MAX); \
+	*(__PTR) = __CONV(__data); \
+	return count; \
+}
+#define STORE_INT(__FUNC, __PTR, MIN, MAX) \
+	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
+#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \
+	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
+STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
+STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_aging_expire_store, &dd->aging_expire, 0, INT_MAX);
+STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
+STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
+/*
+ * async_depth must update dd->async_depth. Writing through
+ * &dd->front_merges would leave the depth unchanged and overwrite the
+ * front_merges flag with a value in [1, INT_MAX].
+ */
+STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
+STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
+#undef STORE_FUNCTION
+#undef STORE_INT
+#undef STORE_JIFFIES
+
+/*
+ * Build one sysfs attribute entry called @name with mode 0644, wired to the
+ * deadline_<name>_show() and deadline_<name>_store() handlers.
+ */
+#define DD_ATTR(name) \
+ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
+
+/*
+ * Scheduler tunables, referenced from mq_deadline.elevator_attrs. The list
+ * is terminated by __ATTR_NULL.
+ */
+static struct elv_fs_entry deadline_attrs[] = {
+ DD_ATTR(read_expire),
+ DD_ATTR(write_expire),
+ DD_ATTR(writes_starved),
+ DD_ATTR(front_merges),
+ DD_ATTR(async_depth),
+ DD_ATTR(fifo_batch),
+ DD_ATTR(aging_expire),
+ __ATTR_NULL
+};
+
+#ifdef CONFIG_BLK_DEBUG_FS
+/*
+ * DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) generates the debugfs
+ * helpers for the FIFO of priority level @prio and data direction @data_dir:
+ *
+ * - deadline_<name>_fifo_start/_next/_stop and the matching
+ *   deadline_<name>_fifo_seq_ops, which walk per_prio->fifo_list[data_dir].
+ *   dd->lock is taken in start() and released in stop().
+ * - deadline_<name>_next_rq_show(), which prints per_prio->next_rq[data_dir]
+ *   when it is set. This helper reads the pointer without taking dd->lock.
+ */
+#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
+static void *deadline_##name##_fifo_start(struct seq_file *m, \
+ loff_t *pos) \
+ __acquires(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ spin_lock(&dd->lock); \
+ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \
+} \
+ \
+static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
+ loff_t *pos) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \
+} \
+ \
+static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
+ __releases(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ \
+ spin_unlock(&dd->lock); \
+} \
+ \
+static const struct seq_operations deadline_##name##_fifo_seq_ops = { \
+ .start = deadline_##name##_fifo_start, \
+ .next = deadline_##name##_fifo_next, \
+ .stop = deadline_##name##_fifo_stop, \
+ .show = blk_mq_debugfs_rq_show, \
+}; \
+ \
+static int deadline_##name##_next_rq_show(void *data, \
+ struct seq_file *m) \
+{ \
+ struct request_queue *q = data; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ struct request *rq = per_prio->next_rq[data_dir]; \
+ \
+ if (rq) \
+ __blk_mq_debugfs_rq_show(m, rq); \
+ return 0; \
+}
+
+/* One instance per (priority level, direction); the name suffix is 0, 1 or 2. */
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
+#undef DEADLINE_DEBUGFS_DDIR_ATTRS
+
+/* debugfs: print dd->batching as an unsigned decimal. Always returns 0. */
+static int deadline_batching_show(void *data, struct seq_file *m)
+{
+	struct request_queue *queue = data;
+	struct deadline_data *dd = queue->elevator->elevator_data;
+	const unsigned int batching = dd->batching;
+
+	seq_printf(m, "%u\n", batching);
+
+	return 0;
+}
+
+/* debugfs: print dd->starved as an unsigned decimal. Always returns 0. */
+static int deadline_starved_show(void *data, struct seq_file *m)
+{
+	struct request_queue *queue = data;
+	struct deadline_data *dd = queue->elevator->elevator_data;
+	const unsigned int starved = dd->starved;
+
+	seq_printf(m, "%u\n", starved);
+
+	return 0;
+}
+
+/* debugfs: print dd->async_depth as an unsigned decimal. Always returns 0. */
+static int dd_async_depth_show(void *data, struct seq_file *m)
+{
+	struct request_queue *queue = data;
+	struct deadline_data *dd = queue->elevator->elevator_data;
+
+	seq_printf(m, "%u\n", dd->async_depth);
+
+	return 0;
+}
+
+/*
+ * debugfs: print the dd_queued() value of the RT, BE and IDLE priority
+ * levels, in that order, on one line. Always returns 0.
+ */
+static int dd_queued_show(void *data, struct seq_file *m)
+{
+	struct request_queue *queue = data;
+	struct deadline_data *dd = queue->elevator->elevator_data;
+
+	seq_printf(m, "%u %u %u\n",
+		   dd_queued(dd, DD_RT_PRIO),
+		   dd_queued(dd, DD_BE_PRIO),
+		   dd_queued(dd, DD_IDLE_PRIO));
+
+	return 0;
+}
+
+/*
+ * Number of requests of priority level @prio that are owned by the block
+ * driver: dispatched plus merged, minus completed.
+ */
+static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
+{
+	const u32 handed_out = dd_sum(dd, dispatched, prio) +
+			       dd_sum(dd, merged, prio);
+
+	return handed_out - dd_sum(dd, completed, prio);
+}
+
+/*
+ * debugfs: print dd_owned_by_driver() for the RT, BE and IDLE priority
+ * levels, in that order, on one line. Always returns 0.
+ */
+static int dd_owned_by_driver_show(void *data, struct seq_file *m)
+{
+	struct request_queue *queue = data;
+	struct deadline_data *dd = queue->elevator->elevator_data;
+
+	seq_printf(m, "%u %u %u\n",
+		   dd_owned_by_driver(dd, DD_RT_PRIO),
+		   dd_owned_by_driver(dd, DD_BE_PRIO),
+		   dd_owned_by_driver(dd, DD_IDLE_PRIO));
+
+	return 0;
+}
+
+/*
+ * DEADLINE_DISPATCH_ATTR(prio) generates deadline_dispatch<prio>_start/
+ * _next/_stop and deadline_dispatch<prio>_seq_ops, which walk the dispatch
+ * list of dd->per_prio[prio]. dd->lock is taken in start() and released in
+ * stop(). The macro body ends without a semicolon; the invocation supplies
+ * it.
+ */
+#define DEADLINE_DISPATCH_ATTR(prio) \
+static void *deadline_dispatch##prio##_start(struct seq_file *m, \
+ loff_t *pos) \
+ __acquires(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ spin_lock(&dd->lock); \
+ return seq_list_start(&per_prio->dispatch, *pos); \
+} \
+ \
+static void *deadline_dispatch##prio##_next(struct seq_file *m, \
+ void *v, loff_t *pos) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ return seq_list_next(v, &per_prio->dispatch, pos); \
+} \
+ \
+static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \
+ __releases(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ \
+ spin_unlock(&dd->lock); \
+} \
+ \
+static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
+ .start = deadline_dispatch##prio##_start, \
+ .next = deadline_dispatch##prio##_next, \
+ .stop = deadline_dispatch##prio##_stop, \
+ .show = blk_mq_debugfs_rq_show, \
+}
+
+DEADLINE_DISPATCH_ATTR(0);
+DEADLINE_DISPATCH_ATTR(1);
+DEADLINE_DISPATCH_ATTR(2);
+#undef DEADLINE_DISPATCH_ATTR
+
+/*
+ * Table entry helpers: "<name>_fifo_list" is backed by the seq_ops generated
+ * for that FIFO, "<name>_next_rq" by the matching single-shot show function.
+ * Both entries are read-only (mode 0400).
+ */
+#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
+	{#name "_fifo_list", 0400, \
+			.seq_ops = &deadline_##name##_fifo_seq_ops}
+#define DEADLINE_NEXT_RQ_ATTR(name) \
+	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
+
+/*
+ * debugfs attributes of the scheduler, referenced from
+ * mq_deadline.queue_debugfs_attrs. The empty entry terminates the table.
+ */
+static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
+	DEADLINE_QUEUE_DDIR_ATTRS(read0),
+	DEADLINE_QUEUE_DDIR_ATTRS(write0),
+	DEADLINE_QUEUE_DDIR_ATTRS(read1),
+	DEADLINE_QUEUE_DDIR_ATTRS(write1),
+	DEADLINE_QUEUE_DDIR_ATTRS(read2),
+	DEADLINE_QUEUE_DDIR_ATTRS(write2),
+	DEADLINE_NEXT_RQ_ATTR(read0),
+	DEADLINE_NEXT_RQ_ATTR(write0),
+	DEADLINE_NEXT_RQ_ATTR(read1),
+	DEADLINE_NEXT_RQ_ATTR(write1),
+	DEADLINE_NEXT_RQ_ATTR(read2),
+	DEADLINE_NEXT_RQ_ATTR(write2),
+	{"batching", 0400, deadline_batching_show},
+	{"starved", 0400, deadline_starved_show},
+	{"async_depth", 0400, dd_async_depth_show},
+	{"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
+	{"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
+	{"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
+	{"owned_by_driver", 0400, dd_owned_by_driver_show},
+	{"queued", 0400, dd_queued_show},
+	{},
+};
+/* Both helpers are only meant for the table above; drop them together. */
+#undef DEADLINE_QUEUE_DDIR_ATTRS
+#undef DEADLINE_NEXT_RQ_ATTR
+#endif
+
+/*
+ * Scheduler descriptor: the callback table plus the sysfs attributes and,
+ * with CONFIG_BLK_DEBUG_FS, the debugfs attributes. It is handed to
+ * elv_register() in deadline_init() and to elv_unregister() in
+ * deadline_exit().
+ */
+static struct elevator_type mq_deadline = {
+ .ops = {
+ .depth_updated = dd_depth_updated,
+ .limit_depth = dd_limit_depth,
+ .insert_requests = dd_insert_requests,
+ .dispatch_request = dd_dispatch_request,
+ .prepare_request = dd_prepare_request,
+ .finish_request = dd_finish_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .bio_merge = dd_bio_merge,
+ .request_merge = dd_request_merge,
+ .requests_merged = dd_merged_requests,
+ .request_merged = dd_request_merged,
+ .has_work = dd_has_work,
+ .init_sched = dd_init_sched,
+ .exit_sched = dd_exit_sched,
+ .init_hctx = dd_init_hctx,
+ },
+
+#ifdef CONFIG_BLK_DEBUG_FS
+ .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
+#endif
+ .elevator_attrs = deadline_attrs,
+ .elevator_name = "mq-deadline",
+ .elevator_alias = "deadline",
+ .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
+ .elevator_owner = THIS_MODULE,
+};
+MODULE_ALIAS("mq-deadline-iosched");
+
+/*
+ * Module init: register the elevator, then set up the block cgroup side.
+ * If the latter fails the elevator is unregistered again. Returns 0 on
+ * success or the error code of the step that failed.
+ */
+static int __init deadline_init(void)
+{
+	int ret = elv_register(&mq_deadline);
+
+	if (ret)
+		return ret;
+
+	ret = dd_blkcg_init();
+	if (ret)
+		elv_unregister(&mq_deadline);
+
+	return ret;
+}
+
+/* Module exit: undo deadline_init() in reverse order. */
+static void __exit deadline_exit(void)
+{
+	dd_blkcg_exit();
+	elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
deleted file mode 100644
index 8eea2cbf2bf4..000000000000
--- a/block/mq-deadline.c
+++ /dev/null
@@ -1,815 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
- * for the blk-mq scheduling framework
- *
- * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
- */
-#include <linux/kernel.h>
-#include <linux/fs.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/elevator.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/compiler.h>
-#include <linux/rbtree.h>
-#include <linux/sbitmap.h>
-
-#include <trace/events/block.h>
-
-#include "blk.h"
-#include "blk-mq.h"
-#include "blk-mq-debugfs.h"
-#include "blk-mq-tag.h"
-#include "blk-mq-sched.h"
-
-/*
- * See Documentation/block/deadline-iosched.rst
- */
-static const int read_expire = HZ / 2; /* max time before a read is submitted. */
-static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
-static const int writes_starved = 2; /* max times reads can starve a write */
-static const int fifo_batch = 16; /* # of sequential requests treated as one
- by the above parameters. For throughput. */
-
-struct deadline_data {
- /*
- * run time data
- */
-
- /*
- * requests (deadline_rq s) are present on both sort_list and fifo_list
- */
- struct rb_root sort_list[2];
- struct list_head fifo_list[2];
-
- /*
- * next in sort order. read, write or both are NULL
- */
- struct request *next_rq[2];
- unsigned int batching; /* number of sequential requests made */
- unsigned int starved; /* times reads have starved writes */
-
- /*
- * settings that change how the i/o scheduler behaves
- */
- int fifo_expire[2];
- int fifo_batch;
- int writes_starved;
- int front_merges;
-
- spinlock_t lock;
- spinlock_t zone_lock;
- struct list_head dispatch;
-};
-
-static inline struct rb_root *
-deadline_rb_root(struct deadline_data *dd, struct request *rq)
-{
- return &dd->sort_list[rq_data_dir(rq)];
-}
-
-/*
- * get the request after `rq' in sector-sorted order
- */
-static inline struct request *
-deadline_latter_request(struct request *rq)
-{
- struct rb_node *node = rb_next(&rq->rb_node);
-
- if (node)
- return rb_entry_rq(node);
-
- return NULL;
-}
-
-static void
-deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
-{
- struct rb_root *root = deadline_rb_root(dd, rq);
-
- elv_rb_add(root, rq);
-}
-
-static inline void
-deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
-{
- const int data_dir = rq_data_dir(rq);
-
- if (dd->next_rq[data_dir] == rq)
- dd->next_rq[data_dir] = deadline_latter_request(rq);
-
- elv_rb_del(deadline_rb_root(dd, rq), rq);
-}
-
-/*
- * remove rq from rbtree and fifo.
- */
-static void deadline_remove_request(struct request_queue *q, struct request *rq)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
-
- list_del_init(&rq->queuelist);
-
- /*
- * We might not be on the rbtree, if we are doing an insert merge
- */
- if (!RB_EMPTY_NODE(&rq->rb_node))
- deadline_del_rq_rb(dd, rq);
-
- elv_rqhash_del(q, rq);
- if (q->last_merge == rq)
- q->last_merge = NULL;
-}
-
-static void dd_request_merged(struct request_queue *q, struct request *req,
- enum elv_merge type)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
-
- /*
- * if the merge was a front merge, we need to reposition request
- */
- if (type == ELEVATOR_FRONT_MERGE) {
- elv_rb_del(deadline_rb_root(dd, req), req);
- deadline_add_rq_rb(dd, req);
- }
-}
-
-static void dd_merged_requests(struct request_queue *q, struct request *req,
- struct request *next)
-{
- /*
- * if next expires before rq, assign its expire time to rq
- * and move into next position (next will be deleted) in fifo
- */
- if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
- if (time_before((unsigned long)next->fifo_time,
- (unsigned long)req->fifo_time)) {
- list_move(&req->queuelist, &next->queuelist);
- req->fifo_time = next->fifo_time;
- }
- }
-
- /*
- * kill knowledge of next, this one is a goner
- */
- deadline_remove_request(q, next);
-}
-
-/*
- * move an entry to dispatch queue
- */
-static void
-deadline_move_request(struct deadline_data *dd, struct request *rq)
-{
- const int data_dir = rq_data_dir(rq);
-
- dd->next_rq[READ] = NULL;
- dd->next_rq[WRITE] = NULL;
- dd->next_rq[data_dir] = deadline_latter_request(rq);
-
- /*
- * take it off the sort and fifo list
- */
- deadline_remove_request(rq->q, rq);
-}
-
-/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
- * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
- */
-static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
-{
- struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
-
- /*
- * rq is expired!
- */
- if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
- return 1;
-
- return 0;
-}
-
-/*
- * For the specified data direction, return the next request to
- * dispatch using arrival ordered lists.
- */
-static struct request *
-deadline_fifo_request(struct deadline_data *dd, int data_dir)
-{
- struct request *rq;
- unsigned long flags;
-
- if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
- return NULL;
-
- if (list_empty(&dd->fifo_list[data_dir]))
- return NULL;
-
- rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
- if (data_dir == READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
- if (blk_req_can_dispatch_to_zone(rq))
- goto out;
- }
- rq = NULL;
-out:
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
-}
-
-/*
- * For the specified data direction, return the next request to
- * dispatch using sector position sorted lists.
- */
-static struct request *
-deadline_next_request(struct deadline_data *dd, int data_dir)
-{
- struct request *rq;
- unsigned long flags;
-
- if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
- return NULL;
-
- rq = dd->next_rq[data_dir];
- if (!rq)
- return NULL;
-
- if (data_dir == READ || !blk_queue_is_zoned(rq->q))
- return rq;
-
- /*
- * Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
- */
- spin_lock_irqsave(&dd->zone_lock, flags);
- while (rq) {
- if (blk_req_can_dispatch_to_zone(rq))
- break;
- rq = deadline_latter_request(rq);
- }
- spin_unlock_irqrestore(&dd->zone_lock, flags);
-
- return rq;
-}
-
-/*
- * deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
- */
-static struct request *__dd_dispatch_request(struct deadline_data *dd)
-{
- struct request *rq, *next_rq;
- bool reads, writes;
- int data_dir;
-
- if (!list_empty(&dd->dispatch)) {
- rq = list_first_entry(&dd->dispatch, struct request, queuelist);
- list_del_init(&rq->queuelist);
- goto done;
- }
-
- reads = !list_empty(&dd->fifo_list[READ]);
- writes = !list_empty(&dd->fifo_list[WRITE]);
-
- /*
- * batches are currently reads XOR writes
- */
- rq = deadline_next_request(dd, WRITE);
- if (!rq)
- rq = deadline_next_request(dd, READ);
-
- if (rq && dd->batching < dd->fifo_batch)
- /* we have a next request are still entitled to batch */
- goto dispatch_request;
-
- /*
- * at this point we are not running a batch. select the appropriate
- * data direction (read / write)
- */
-
- if (reads) {
- BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
-
- if (deadline_fifo_request(dd, WRITE) &&
- (dd->starved++ >= dd->writes_starved))
- goto dispatch_writes;
-
- data_dir = READ;
-
- goto dispatch_find_request;
- }
-
- /*
- * there are either no reads or writes have been starved
- */
-
- if (writes) {
-dispatch_writes:
- BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
-
- dd->starved = 0;
-
- data_dir = WRITE;
-
- goto dispatch_find_request;
- }
-
- return NULL;
-
-dispatch_find_request:
- /*
- * we are not running a batch, find best request for selected data_dir
- */
- next_rq = deadline_next_request(dd, data_dir);
- if (deadline_check_fifo(dd, data_dir) || !next_rq) {
- /*
- * A deadline has expired, the last request was in the other
- * direction, or we have run out of higher-sectored requests.
- * Start again from the request with the earliest expiry time.
- */
- rq = deadline_fifo_request(dd, data_dir);
- } else {
- /*
- * The last req was the same dir and we have a next request in
- * sort order. No expired requests so continue on from here.
- */
- rq = next_rq;
- }
-
- /*
- * For a zoned block device, if we only have writes queued and none of
- * them can be dispatched, rq will be NULL.
- */
- if (!rq)
- return NULL;
-
- dd->batching = 0;
-
-dispatch_request:
- /*
- * rq is the selected appropriate request.
- */
- dd->batching++;
- deadline_move_request(dd, rq);
-done:
- /*
- * If the request needs its target zone locked, do it.
- */
- blk_req_zone_write_lock(rq);
- rq->rq_flags |= RQF_STARTED;
- return rq;
-}
-
-/*
- * One confusing aspect here is that we get called for a specific
- * hardware queue, but we may return a request that is for a
- * different hardware queue. This is because mq-deadline has shared
- * state for all hardware queues, in terms of sorting, FIFOs, etc.
- */
-static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
- struct request *rq;
-
- spin_lock(&dd->lock);
- rq = __dd_dispatch_request(dd);
- spin_unlock(&dd->lock);
-
- return rq;
-}
-
-static void dd_exit_queue(struct elevator_queue *e)
-{
- struct deadline_data *dd = e->elevator_data;
-
- BUG_ON(!list_empty(&dd->fifo_list[READ]));
- BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
-
- kfree(dd);
-}
-
-/*
- * initialize elevator private data (deadline_data).
- */
-static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
-{
- struct deadline_data *dd;
- struct elevator_queue *eq;
-
- eq = elevator_alloc(q, e);
- if (!eq)
- return -ENOMEM;
-
- dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
- if (!dd) {
- kobject_put(&eq->kobj);
- return -ENOMEM;
- }
- eq->elevator_data = dd;
-
- INIT_LIST_HEAD(&dd->fifo_list[READ]);
- INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
- dd->sort_list[READ] = RB_ROOT;
- dd->sort_list[WRITE] = RB_ROOT;
- dd->fifo_expire[READ] = read_expire;
- dd->fifo_expire[WRITE] = write_expire;
- dd->writes_starved = writes_starved;
- dd->front_merges = 1;
- dd->fifo_batch = fifo_batch;
- spin_lock_init(&dd->lock);
- spin_lock_init(&dd->zone_lock);
- INIT_LIST_HEAD(&dd->dispatch);
-
- q->elevator = eq;
- return 0;
-}
-
-static int dd_request_merge(struct request_queue *q, struct request **rq,
- struct bio *bio)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
- sector_t sector = bio_end_sector(bio);
- struct request *__rq;
-
- if (!dd->front_merges)
- return ELEVATOR_NO_MERGE;
-
- __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
- if (__rq) {
- BUG_ON(sector != blk_rq_pos(__rq));
-
- if (elv_bio_merge_ok(__rq, bio)) {
- *rq = __rq;
- return ELEVATOR_FRONT_MERGE;
- }
- }
-
- return ELEVATOR_NO_MERGE;
-}
-
-static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
- struct request *free = NULL;
- bool ret;
-
- spin_lock(&dd->lock);
- ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
- spin_unlock(&dd->lock);
-
- if (free)
- blk_mq_free_request(free);
-
- return ret;
-}
-
-/*
- * add rq to rbtree and fifo
- */
-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head)
-{
- struct request_queue *q = hctx->queue;
- struct deadline_data *dd = q->elevator->elevator_data;
- const int data_dir = rq_data_dir(rq);
-
- /*
- * This may be a requeue of a write request that has locked its
- * target zone. If it is the case, this releases the zone lock.
- */
- blk_req_zone_write_unlock(rq);
-
- if (blk_mq_sched_try_insert_merge(q, rq))
- return;
-
- trace_block_rq_insert(rq);
-
- if (at_head) {
- list_add(&rq->queuelist, &dd->dispatch);
- } else {
- deadline_add_rq_rb(dd, rq);
-
- if (rq_mergeable(rq)) {
- elv_rqhash_add(q, rq);
- if (!q->last_merge)
- q->last_merge = rq;
- }
-
- /*
- * set expire time and add to fifo list
- */
- rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
- list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
- }
-}
-
-static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct list_head *list, bool at_head)
-{
- struct request_queue *q = hctx->queue;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- spin_lock(&dd->lock);
- while (!list_empty(list)) {
- struct request *rq;
-
- rq = list_first_entry(list, struct request, queuelist);
- list_del_init(&rq->queuelist);
- dd_insert_request(hctx, rq, at_head);
- }
- spin_unlock(&dd->lock);
-}
-
-/*
- * Nothing to do here. This is defined only to ensure that .finish_request
- * method is called upon request completion.
- */
-static void dd_prepare_request(struct request *rq)
-{
-}
-
-/*
- * For zoned block devices, write unlock the target zone of
- * completed write requests. Do this while holding the zone lock
- * spinlock so that the zone is never unlocked while deadline_fifo_request()
- * or deadline_next_request() are executing. This function is called for
- * all requests, whether or not these requests complete successfully.
- *
- * For a zoned block device, __dd_dispatch_request() may have stopped
- * dispatching requests if all the queued requests are write requests directed
- * at zones that are already locked due to on-going write requests. To ensure
- * write request dispatch progress in this case, mark the queue as needing a
- * restart to ensure that the queue is run again after completion of the
- * request and zones being unlocked.
- */
-static void dd_finish_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
-
- if (blk_queue_is_zoned(q)) {
- struct deadline_data *dd = q->elevator->elevator_data;
- unsigned long flags;
-
- spin_lock_irqsave(&dd->zone_lock, flags);
- blk_req_zone_write_unlock(rq);
- if (!list_empty(&dd->fifo_list[WRITE]))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
- spin_unlock_irqrestore(&dd->zone_lock, flags);
- }
-}
-
-static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
-{
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-
- return !list_empty_careful(&dd->dispatch) ||
- !list_empty_careful(&dd->fifo_list[0]) ||
- !list_empty_careful(&dd->fifo_list[1]);
-}
-
-/*
- * sysfs parts below
- */
-static ssize_t
-deadline_var_show(int var, char *page)
-{
- return sprintf(page, "%d\n", var);
-}
-
-static void
-deadline_var_store(int *var, const char *page)
-{
- char *p = (char *) page;
-
- *var = simple_strtol(p, &p, 10);
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
-static ssize_t __FUNC(struct elevator_queue *e, char *page) \
-{ \
- struct deadline_data *dd = e->elevator_data; \
- int __data = __VAR; \
- if (__CONV) \
- __data = jiffies_to_msecs(__data); \
- return deadline_var_show(__data, (page)); \
-}
-SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
-#undef SHOW_FUNCTION
-
-#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
-static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
-{ \
- struct deadline_data *dd = e->elevator_data; \
- int __data; \
- deadline_var_store(&__data, (page)); \
- if (__data < (MIN)) \
- __data = (MIN); \
- else if (__data > (MAX)) \
- __data = (MAX); \
- if (__CONV) \
- *(__PTR) = msecs_to_jiffies(__data); \
- else \
- *(__PTR) = __data; \
- return count; \
-}
-STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
-#undef STORE_FUNCTION
-
-#define DD_ATTR(name) \
- __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
-
-static struct elv_fs_entry deadline_attrs[] = {
- DD_ATTR(read_expire),
- DD_ATTR(write_expire),
- DD_ATTR(writes_starved),
- DD_ATTR(front_merges),
- DD_ATTR(fifo_batch),
- __ATTR_NULL
-};
-
-#ifdef CONFIG_BLK_DEBUG_FS
-#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \
-static void *deadline_##name##_fifo_start(struct seq_file *m, \
- loff_t *pos) \
- __acquires(&dd->lock) \
-{ \
- struct request_queue *q = m->private; \
- struct deadline_data *dd = q->elevator->elevator_data; \
- \
- spin_lock(&dd->lock); \
- return seq_list_start(&dd->fifo_list[ddir], *pos); \
-} \
- \
-static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
- loff_t *pos) \
-{ \
- struct request_queue *q = m->private; \
- struct deadline_data *dd = q->elevator->elevator_data; \
- \
- return seq_list_next(v, &dd->fifo_list[ddir], pos); \
-} \
- \
-static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
- __releases(&dd->lock) \
-{ \
- struct request_queue *q = m->private; \
- struct deadline_data *dd = q->elevator->elevator_data; \
- \
- spin_unlock(&dd->lock); \
-} \
- \
-static const struct seq_operations deadline_##name##_fifo_seq_ops = { \
- .start = deadline_##name##_fifo_start, \
- .next = deadline_##name##_fifo_next, \
- .stop = deadline_##name##_fifo_stop, \
- .show = blk_mq_debugfs_rq_show, \
-}; \
- \
-static int deadline_##name##_next_rq_show(void *data, \
- struct seq_file *m) \
-{ \
- struct request_queue *q = data; \
- struct deadline_data *dd = q->elevator->elevator_data; \
- struct request *rq = dd->next_rq[ddir]; \
- \
- if (rq) \
- __blk_mq_debugfs_rq_show(m, rq); \
- return 0; \
-}
-DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read)
-DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write)
-#undef DEADLINE_DEBUGFS_DDIR_ATTRS
-
-static int deadline_batching_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- seq_printf(m, "%u\n", dd->batching);
- return 0;
-}
-
-static int deadline_starved_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- seq_printf(m, "%u\n", dd->starved);
- return 0;
-}
-
-static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&dd->lock)
-{
- struct request_queue *q = m->private;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- spin_lock(&dd->lock);
- return seq_list_start(&dd->dispatch, *pos);
-}
-
-static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
-{
- struct request_queue *q = m->private;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- return seq_list_next(v, &dd->dispatch, pos);
-}
-
-static void deadline_dispatch_stop(struct seq_file *m, void *v)
- __releases(&dd->lock)
-{
- struct request_queue *q = m->private;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- spin_unlock(&dd->lock);
-}
-
-static const struct seq_operations deadline_dispatch_seq_ops = {
- .start = deadline_dispatch_start,
- .next = deadline_dispatch_next,
- .stop = deadline_dispatch_stop,
- .show = blk_mq_debugfs_rq_show,
-};
-
-#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
- {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \
- {#name "_next_rq", 0400, deadline_##name##_next_rq_show}
-static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
- DEADLINE_QUEUE_DDIR_ATTRS(read),
- DEADLINE_QUEUE_DDIR_ATTRS(write),
- {"batching", 0400, deadline_batching_show},
- {"starved", 0400, deadline_starved_show},
- {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
- {},
-};
-#undef DEADLINE_QUEUE_DDIR_ATTRS
-#endif
-
-static struct elevator_type mq_deadline = {
- .ops = {
- .insert_requests = dd_insert_requests,
- .dispatch_request = dd_dispatch_request,
- .prepare_request = dd_prepare_request,
- .finish_request = dd_finish_request,
- .next_request = elv_rb_latter_request,
- .former_request = elv_rb_former_request,
- .bio_merge = dd_bio_merge,
- .request_merge = dd_request_merge,
- .requests_merged = dd_merged_requests,
- .request_merged = dd_request_merged,
- .has_work = dd_has_work,
- .init_sched = dd_init_queue,
- .exit_sched = dd_exit_queue,
- },
-
-#ifdef CONFIG_BLK_DEBUG_FS
- .queue_debugfs_attrs = deadline_queue_debugfs_attrs,
-#endif
- .elevator_attrs = deadline_attrs,
- .elevator_name = "mq-deadline",
- .elevator_alias = "deadline",
- .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
- .elevator_owner = THIS_MODULE,
-};
-MODULE_ALIAS("mq-deadline-iosched");
-
-static int __init deadline_init(void)
-{
- return elv_register(&mq_deadline);
-}
-
-static void __exit deadline_exit(void)
-{
- elv_unregister(&mq_deadline);
-}
-
-module_init(deadline_init);
-module_exit(deadline_exit);
-
-MODULE_AUTHOR("Jens Axboe");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("MQ deadline IO scheduler");
diff --git a/block/partitions/core.c b/block/partitions/core.c
index dc60ecf46fe6..347c56a51d87 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -120,8 +120,7 @@ static void free_partitions(struct parsed_partitions *state)
kfree(state);
}
-static struct parsed_partitions *check_partition(struct gendisk *hd,
- struct block_device *bdev)
+static struct parsed_partitions *check_partition(struct gendisk *hd)
{
struct parsed_partitions *state;
int i, res, err;
@@ -136,7 +135,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd,
}
state->pp_buf[0] = '\0';
- state->bdev = bdev;
+ state->bdev = hd->part0;
disk_name(hd, 0, state->name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
@@ -260,7 +259,8 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
- blk_free_devt(dev->devt);
+ if (MAJOR(dev->devt) == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(MINOR(dev->devt));
bdput(dev_to_bdev(dev));
}
@@ -282,7 +282,7 @@ struct device_type part_type = {
};
/*
- * Must be called either with bd_mutex held, before a disk can be opened or
+ * Must be called either with open_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
static void delete_partition(struct block_device *part)
@@ -311,7 +311,7 @@ static ssize_t whole_disk_show(struct device *dev,
static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
/*
- * Must be called either with bd_mutex held, before a disk can be opened or
+ * Must be called either with open_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
static struct block_device *add_partition(struct gendisk *disk, int partno,
@@ -325,10 +325,8 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
const char *dname;
int err;
- /*
- * disk_max_parts() won't be zero, either GENHD_FL_EXT_DEVT is set
- * or 'minors' is passed to alloc_disk().
- */
+ lockdep_assert_held(&disk->open_mutex);
+
if (partno >= disk_max_parts(disk))
return ERR_PTR(-EINVAL);
@@ -379,9 +377,15 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
pdev->type = &part_type;
pdev->parent = ddev;
- err = blk_alloc_devt(bdev, &devt);
- if (err)
- goto out_put;
+ /* in consecutive minor range? */
+ if (bdev->bd_partno < disk->minors) {
+ devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
+ } else {
+ err = blk_alloc_ext_minor();
+ if (err < 0)
+ goto out_put;
+ devt = MKDEV(BLOCK_EXT_MAJOR, err);
+ }
pdev->devt = devt;
/* delay uevent until 'holders' subdir is created */
@@ -450,29 +454,27 @@ int bdev_add_partition(struct block_device *bdev, int partno,
{
struct block_device *part;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&bdev->bd_disk->open_mutex);
if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
return -EBUSY;
}
part = add_partition(bdev->bd_disk, partno, start, length,
ADDPART_FLAG_NONE, NULL);
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
return PTR_ERR_OR_ZERO(part);
}
int bdev_del_partition(struct block_device *bdev, int partno)
{
- struct block_device *part;
- int ret;
+ struct block_device *part = NULL;
+ int ret = -ENXIO;
- part = bdget_disk(bdev->bd_disk, partno);
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ part = xa_load(&bdev->bd_disk->part_tbl, partno);
if (!part)
- return -ENXIO;
-
- mutex_lock(&part->bd_mutex);
- mutex_lock_nested(&bdev->bd_mutex, 1);
+ goto out_unlock;
ret = -EBUSY;
if (part->bd_openers)
@@ -481,24 +483,21 @@ int bdev_del_partition(struct block_device *bdev, int partno)
delete_partition(part);
ret = 0;
out_unlock:
- mutex_unlock(&bdev->bd_mutex);
- mutex_unlock(&part->bd_mutex);
- bdput(part);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
return ret;
}
int bdev_resize_partition(struct block_device *bdev, int partno,
sector_t start, sector_t length)
{
- struct block_device *part;
- int ret = 0;
+ struct block_device *part = NULL;
+ int ret = -ENXIO;
- part = bdget_disk(bdev->bd_disk, partno);
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ part = xa_load(&bdev->bd_disk->part_tbl, partno);
if (!part)
- return -ENXIO;
+ goto out_unlock;
- mutex_lock(&part->bd_mutex);
- mutex_lock_nested(&bdev->bd_mutex, 1);
ret = -EINVAL;
if (start != part->bd_start_sect)
goto out_unlock;
@@ -511,9 +510,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno,
ret = 0;
out_unlock:
- mutex_unlock(&part->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(part);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
return ret;
}
@@ -538,7 +535,7 @@ void blk_drop_partitions(struct gendisk *disk)
struct block_device *part;
unsigned long idx;
- lockdep_assert_held(&disk->part0->bd_mutex);
+ lockdep_assert_held(&disk->open_mutex);
xa_for_each_start(&disk->part_tbl, idx, part, 1) {
if (!bdgrab(part))
@@ -548,7 +545,7 @@ void blk_drop_partitions(struct gendisk *disk)
}
}
-static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
+static bool blk_add_partition(struct gendisk *disk,
struct parsed_partitions *state, int p)
{
sector_t size = state->parts[p].size;
@@ -598,7 +595,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
return true;
}
-int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
+static int blk_add_partitions(struct gendisk *disk)
{
struct parsed_partitions *state;
int ret = -EAGAIN, p;
@@ -606,7 +603,7 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
if (!disk_part_scan_enabled(disk))
return 0;
- state = check_partition(disk, bdev);
+ state = check_partition(disk);
if (!state)
return 0;
if (IS_ERR(state)) {
@@ -650,7 +647,7 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
for (p = 1; p < state->limit; p++)
- if (!blk_add_partition(disk, bdev, state, p))
+ if (!blk_add_partition(disk, state, p))
goto out_free_state;
ret = 0;
@@ -659,6 +656,58 @@ out_free_state:
return ret;
}
+int bdev_disk_changed(struct gendisk *disk, bool invalidate)
+{
+ int ret = 0;
+
+ lockdep_assert_held(&disk->open_mutex);
+
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
+
+rescan:
+ if (disk->open_partitions)
+ return -EBUSY;
+ sync_blockdev(disk->part0);
+ invalidate_bdev(disk->part0);
+ blk_drop_partitions(disk);
+
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ /*
+ * Historically we only set the capacity to zero for devices that
+ * support partitions (independent of actually having partitions created).
+ * Doing that is rather inconsistent, but changing it broke legacy
+ * udisks polling for legacy ide-cdrom devices. Use the crude check
+ * below to get the sane behavior for most devices while not breaking
+ * userspace for this particular setup.
+ */
+ if (invalidate) {
+ if (disk_part_scan_enabled(disk) ||
+ !(disk->flags & GENHD_FL_REMOVABLE))
+ set_capacity(disk, 0);
+ }
+
+ if (get_capacity(disk)) {
+ ret = blk_add_partitions(disk);
+ if (ret == -EAGAIN)
+ goto rescan;
+ } else if (invalidate) {
+ /*
+ * Tell userspace that the media / partition table may have
+ * changed.
+ */
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ }
+
+ return ret;
+}
+/*
+ * Only exported for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
+
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
struct address_space *mapping = state->bdev->bd_inode->i_mapping;
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 8f2fcc080264..63e4f6f8b6e9 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -622,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state)
for (slot = 1; slot <= 4; slot++, p++) {
if (p->boot_ind != 0 && p->boot_ind != 0x80) {
/*
- * Even without a valid boot inidicator value
+ * Even without a valid boot indicator value
* its still possible this is valid FAT filesystem
* without a partition table.
*/