summary | refs | log | tree | commit | diff
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig | 8
-rw-r--r-- block/Makefile | 3
-rw-r--r-- block/bdev.c | 6
-rw-r--r-- block/bfq-cgroup.c | 16
-rw-r--r-- block/bfq-iosched.c | 41
-rw-r--r-- block/bfq-iosched.h | 1
-rw-r--r-- block/bfq-wf2q.c | 15
-rw-r--r-- block/bio-integrity.c | 2
-rw-r--r-- block/bio.c | 9
-rw-r--r-- block/blk-cgroup.c | 5
-rw-r--r-- block/blk-cgroup.h | 17
-rw-r--r-- block/blk-core.c | 46
-rw-r--r-- block/blk-crypto-internal.h | 12
-rw-r--r-- block/blk-crypto-sysfs.c | 172
-rw-r--r-- block/blk-crypto.c | 3
-rw-r--r-- block/blk-iolatency.c | 2
-rw-r--r-- block/blk-map.c | 2
-rw-r--r-- block/blk-merge.c | 31
-rw-r--r-- block/blk-mq-debugfs.c | 6
-rw-r--r-- block/blk-mq-debugfs.h | 2
-rw-r--r-- block/blk-mq-sched.c | 18
-rw-r--r-- block/blk-mq-sysfs.c | 16
-rw-r--r-- block/blk-mq-tag.c | 4
-rw-r--r-- block/blk-mq.c | 278
-rw-r--r-- block/blk-mq.h | 2
-rw-r--r-- block/blk-rq-qos.h | 20
-rw-r--r-- block/blk-sysfs.c | 42
-rw-r--r-- block/blk-throttle.c | 48
-rw-r--r-- block/blk-throttle.h | 3
-rw-r--r-- block/blk.h | 2
-rw-r--r-- block/elevator.c | 16
-rw-r--r-- block/fops.c | 38
-rw-r--r-- block/genhd.c | 69
33 files changed, 659 insertions, 296 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 168b873eb666..7eb5d6d53b3f 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -28,15 +28,13 @@ if BLOCK
config BLOCK_LEGACY_AUTOLOAD
bool "Legacy autoloading support"
+ default y
help
Enable loading modules and creating block device instances based on
accesses through their device special file. This is a historic Linux
feature and makes no sense in a udev world where device files are
- created on demand.
-
- Say N here unless booting or other functionality broke without it, in
- which case you should also send a report to your distribution and
- linux-block@vger.kernel.org.
+ created on demand, but scripts that manually create device nodes and
+ then call losetup might rely on this behavior.
config BLK_RQ_ALLOC_TIME
bool
diff --git a/block/Makefile b/block/Makefile
index f38eaa612929..3950ecbc5c26 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
+ blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/bdev.c b/block/bdev.c
index c68772644566..13de871fa816 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -385,7 +385,7 @@ static struct kmem_cache * bdev_cachep __read_mostly;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
- struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
+ struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
@@ -678,7 +678,7 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
if (test_bit(GD_NEED_PART_SCAN, &disk->state))
bdev_disk_changed(disk, false);
bdev->bd_openers++;
- return 0;;
+ return 0;
}
static void blkdev_put_whole(struct block_device *bdev, fmode_t mode)
@@ -738,7 +738,7 @@ struct block_device *blkdev_get_no_open(dev_t dev)
inode = ilookup(blockdev_superblock, dev);
if (inode)
pr_warn_ratelimited(
-"block device autoloading is deprecated. It will be removed in Linux 5.19\n");
+"block device autoloading is deprecated and will be removed.\n");
}
if (!inode)
return NULL;
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 24a5c5329bcd..420eda2589c0 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -645,8 +645,22 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
{
struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_group *old_parent = bfqq_group(bfqq);
/*
+ * No point to move bfqq to the same group, which can happen when
+ * root group is offlined
+ */
+ if (old_parent == bfqg)
+ return;
+
+ /*
+ * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+ * until elevator exit.
+ */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+ /*
* Get extra reference to prevent bfqq from being freed in
* next possible expire or deactivate.
*/
@@ -666,7 +680,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_and_blkg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(old_parent);
if (entity->parent &&
entity->parent->last_bfqq_created == bfqq)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0c612a911696..2e0dd68a3cbe 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -774,7 +774,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
@@ -2153,7 +2153,7 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
- bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
@@ -2669,7 +2669,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -2782,6 +2782,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
* are likely to increase the throughput.
*/
bfqq->new_bfqq = new_bfqq;
+ /*
+ * The above assignment schedules the following redirections:
+ * each time some I/O for bfqq arrives, the process that
+ * generated that I/O is disassociated from bfqq and
+ * associated with new_bfqq. Here we increases new_bfqq->ref
+ * in advance, adding the number of processes that are
+ * expected to be associated with new_bfqq as they happen to
+ * issue I/O.
+ */
new_bfqq->ref += process_refs;
return new_bfqq;
}
@@ -2844,6 +2853,10 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ /* if a merge has already been setup, then proceed with that first */
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
/*
* Check delayed stable merge for rotational or non-queueing
* devs. For this branch to be executed, bfqq must not be
@@ -2945,9 +2958,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfq_too_late_for_merging(bfqq))
return NULL;
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
-
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
@@ -5181,7 +5191,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
- bool waiting_rq, idle_timer_disabled;
+ bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -5189,14 +5199,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx);
-
- idle_timer_disabled =
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ if (in_serv_queue == bfqd->in_service_queue) {
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ }
spin_unlock_irq(&bfqd->lock);
-
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
- idle_timer_disabled);
+ bfq_update_dispatch_stats(hctx->queue, rq,
+ idle_timer_disabled ? in_serv_queue : NULL,
+ idle_timer_disabled);
return rq;
}
@@ -5448,7 +5459,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
bfq_release_process_ref(bfqd, bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, true);
+ bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
bic_set_bfqq(bic, bfqq, false);
}
@@ -7018,6 +7029,8 @@ static void bfq_exit_queue(struct elevator_queue *e)
spin_unlock_irq(&bfqd->lock);
#endif
+ wbt_enable_default(bfqd->queue);
+
kfree(bfqd);
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 72255ec44f8f..3b83e3d1c2e5 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -1050,7 +1050,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 709b901de3ca..f8eb340381cf 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -142,16 +142,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
@@ -230,11 +220,6 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- return bfqq->bfqd->root_group;
-}
-
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
return false;
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index bd5453220065..6996e7bd66e9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -373,7 +373,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
- bip->bip_iter.bi_sector += bytes_done >> 9;
+ bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9);
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
diff --git a/block/bio.c b/block/bio.c
index b15f5466ce08..33979f306e9e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -807,12 +807,6 @@ int bio_init_clone(struct block_device *bdev, struct bio *bio,
}
EXPORT_SYMBOL(bio_init_clone);
-const char *bio_devname(struct bio *bio, char *buf)
-{
- return bdevname(bio->bi_bdev, buf);
-}
-EXPORT_SYMBOL(bio_devname);
-
/**
* bio_full - check if the bio is full
* @bio: bio to check
@@ -1522,8 +1516,7 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
- rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
+ rq_qos_done_bio(bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index fa063c6c0338..d53b0d69dd73 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -82,6 +82,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+ if (blkg->q)
+ blk_put_queue(blkg->q);
free_percpu(blkg->iostat_cpu);
percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
@@ -167,6 +169,9 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg->iostat_cpu)
goto err_free;
+ if (!blk_get_queue(q))
+ goto err_free;
+
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index 3e91803c4a55..47e1e38390c9 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -15,6 +15,7 @@
*/
#include <linux/blk-cgroup.h>
+#include <linux/blk-mq.h>
/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
@@ -428,6 +429,21 @@ static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
}
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * @bio and @rq should belong to the same cgroup and their issue_as_root should
+ * match. The latter is necessary as we don't want to throttle e.g. a metadata
+ * update because it happens to be next to a regular IO.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+ return rq->bio->bi_blkg == bio->bi_blkg &&
+ bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else /* CONFIG_BLK_CGROUP */
@@ -467,6 +483,7 @@ static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
#define blk_queue_for_each_rl(rl, q) \
for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
diff --git a/block/blk-core.c b/block/blk-core.c
index ce08f0aa9dfc..d4ae6ac53ffc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -50,6 +50,7 @@
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
+#include "blk-rq-qos.h"
struct dentry *blk_debugfs_root;
@@ -285,13 +286,6 @@ void blk_queue_start_drain(struct request_queue *q)
wake_up_all(&q->mq_freeze_wq);
}
-void blk_set_queue_dying(struct request_queue *q)
-{
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
- blk_queue_start_drain(q);
-}
-EXPORT_SYMBOL_GPL(blk_set_queue_dying);
-
/**
* blk_cleanup_queue - shutdown a request queue
* @q: request queue to shutdown
@@ -309,7 +303,8 @@ void blk_cleanup_queue(struct request_queue *q)
WARN_ON_ONCE(blk_queue_registered(q));
/* mark @q DYING, no new request or merges will be allowed afterwards */
- blk_set_queue_dying(q);
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+ blk_queue_start_drain(q);
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
@@ -321,6 +316,9 @@ void blk_cleanup_queue(struct request_queue *q)
*/
blk_freeze_queue(q);
+ /* cleanup rq qos structures for queue without disk */
+ rq_qos_exit(q);
+
blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
blk_sync_queue(q);
@@ -342,8 +340,6 @@ void blk_cleanup_queue(struct request_queue *q)
blk_mq_sched_free_rqs(q);
mutex_unlock(&q->sysfs_lock);
- percpu_ref_exit(&q->q_usage_counter);
-
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
@@ -496,17 +492,12 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
goto fail_stats;
- if (blkcg_init_queue(q))
- goto fail_ref;
-
blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
-fail_ref:
- percpu_ref_exit(&q->q_usage_counter);
fail_stats:
blk_free_queue_stats(q->stats);
fail_split:
@@ -540,17 +531,6 @@ bool blk_get_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_get_queue);
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
- char b[BDEVNAME_SIZE];
-
- pr_info_ratelimited("%s: attempt to access beyond end of device\n"
- "%s: rw=%d, want=%llu, limit=%llu\n",
- current->comm,
- bio_devname(bio, b), bio->bi_opf,
- bio_end_sector(bio), maxsector);
-}
-
#ifdef CONFIG_FAIL_MAKE_REQUEST
static DECLARE_FAULT_ATTR(fail_make_request);
@@ -580,14 +560,10 @@ late_initcall(fail_make_request_debugfs);
static inline bool bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
- char b[BDEVNAME_SIZE];
-
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
-
- WARN_ONCE(1,
- "Trying to write to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), bio->bi_bdev->bd_partno);
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
/* Older lvm-tools actually trigger this */
return false;
}
@@ -616,7 +592,11 @@ static inline int bio_check_eod(struct bio *bio)
if (nr_sectors && maxsector &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
- handle_bad_sector(bio, maxsector);
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+ "%pg: rw=%d, want=%llu, limit=%llu\n",
+ current->comm,
+ bio->bi_bdev, bio->bi_opf,
+ bio_end_sector(bio), maxsector);
return -EIO;
}
return 0;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index 2fb0d65a464c..e6818ffaddbf 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -11,6 +11,7 @@
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
+ const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
unsigned int ivsize; /* iv size in bytes */
@@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+int blk_crypto_sysfs_register(struct request_queue *q);
+
+void blk_crypto_sysfs_unregister(struct request_queue *q);
+
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -62,6 +67,13 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
{
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..fd93bd2f33b7
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto. This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+ struct kobject kobj;
+ struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+ return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+ return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+ &max_dun_bits_attr.attr,
+ &num_keyslots_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+ .attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes. To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */
+static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
+static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
+
+static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+ int mode_num = a - __blk_crypto_mode_attrs;
+
+ if (profile->modes_supported[mode_num])
+ return 0444;
+ return 0;
+}
+
+static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ int mode_num = attr - __blk_crypto_mode_attrs;
+
+ return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]);
+}
+
+static const struct attribute_group blk_crypto_modes_attr_group = {
+ .name = "modes",
+ .attrs = blk_crypto_mode_attrs,
+ .is_visible = blk_crypto_mode_is_visible,
+};
+
+static const struct attribute_group *blk_crypto_attr_groups[] = {
+ &blk_crypto_attr_group,
+ &blk_crypto_modes_attr_group,
+ NULL,
+};
+
+static ssize_t blk_crypto_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ return a->show(profile, a, page);
+}
+
+static const struct sysfs_ops blk_crypto_attr_ops = {
+ .show = blk_crypto_attr_show,
+};
+
+static void blk_crypto_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct blk_crypto_kobj, kobj));
+}
+
+static struct kobj_type blk_crypto_ktype = {
+ .default_groups = blk_crypto_attr_groups,
+ .sysfs_ops = &blk_crypto_attr_ops,
+ .release = blk_crypto_release,
+};
+
+/*
+ * If the request_queue has a blk_crypto_profile, create the "crypto"
+ * subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
+ */
+int blk_crypto_sysfs_register(struct request_queue *q)
+{
+ struct blk_crypto_kobj *obj;
+ int err;
+
+ if (!q->crypto_profile)
+ return 0;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ obj->profile = q->crypto_profile;
+
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
+ "crypto");
+ if (err) {
+ kobject_put(&obj->kobj);
+ return err;
+ }
+ q->crypto_kobject = &obj->kobj;
+ return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct request_queue *q)
+{
+ kobject_put(q->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+ for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+ attr->attr.name = blk_crypto_modes[i].name;
+ attr->attr.mode = 0444;
+ attr->show = blk_crypto_mode_show;
+ blk_crypto_mode_attrs[i - 1] = &attr->attr;
+ }
+ return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 773dae4c329b..a496aaef85ba 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -19,16 +19,19 @@
const struct blk_crypto_mode blk_crypto_modes[] = {
[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 010e658d44a8..2f33932e72e3 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -598,7 +598,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+ if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
diff --git a/block/blk-map.c b/block/blk-map.c
index 4526adde0156..c7f71d83eff1 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -446,7 +446,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (bytes > len)
bytes = len;
- page = alloc_page(GFP_NOIO | gfp_mask);
+ page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask);
if (!page)
goto cleanup;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f5255991b773..ea6968313b4a 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,6 +9,7 @@
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
@@ -598,6 +599,9 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
+ if (!blk_cgroup_mergeable(req, bio))
+ goto no_merge;
+
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
@@ -694,6 +698,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (total_phys_segments > blk_rq_get_max_segments(req))
return 0;
+ if (!blk_cgroup_mergeable(req, next->bio))
+ return 0;
+
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
@@ -902,6 +909,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
+ /* don't merge across cgroup boundaries */
+ if (!blk_cgroup_mergeable(rq, bio))
+ return false;
+
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
@@ -1087,12 +1098,20 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
if (!plug || rq_list_empty(plug->mq_list))
return false;
- /* check the previously added entry for a quick merge attempt */
- rq = rq_list_peek(&plug->mq_list);
- if (rq->q == q) {
- if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
- BIO_MERGE_OK)
- return true;
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q == q) {
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
+ }
+
+ /*
+ * Only keep iterating plug list for merges if we have multiple
+ * queues
+ */
+ if (!plug->multiple_queues)
+ break;
}
return false;
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a790eb4995c..e2880f6deb34 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -707,7 +707,7 @@ static void debugfs_create_files(struct dentry *parent, void *data,
void blk_mq_debugfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
@@ -780,7 +780,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_debugfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_register_hctx(q, hctx);
@@ -789,7 +789,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q)
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_unregister_hctx(hctx);
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a68aa6041a10..69918f4170d6 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
+struct blk_mq_hw_ctx;
+
struct blk_mq_debugfs_attr {
const char *name;
umode_t mode;
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 55488ba97823..9e56a69422b6 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -180,11 +180,18 @@ static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
+ unsigned long end = jiffies + HZ;
int ret;
do {
ret = __blk_mq_do_dispatch_sched(hctx);
- } while (ret == 1);
+ if (ret != 1)
+ break;
+ if (need_resched() || time_is_before_jiffies(end)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ break;
+ }
+ } while (1);
return ret;
}
@@ -515,7 +522,7 @@ static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
@@ -550,9 +557,10 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
- unsigned int i, flags = q->tag_set->flags;
+ unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
+ unsigned long i;
int ret;
if (!e) {
@@ -618,7 +626,7 @@ err_free_map_and_rqs:
void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
@@ -635,7 +643,7 @@ void blk_mq_sched_free_rqs(struct request_queue *q)
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 674786574075..c08426975856 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -206,7 +206,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -255,7 +255,8 @@ void blk_mq_sysfs_init(struct request_queue *q)
int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int ret, i;
+ unsigned long i, j;
+ int ret;
WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
@@ -278,8 +279,10 @@ out:
return ret;
unreg:
- while (--i >= 0)
- blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+ queue_for_each_hw_ctx(q, hctx, j) {
+ if (j < i)
+ blk_mq_unregister_hctx(hctx);
+ }
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
@@ -290,7 +293,7 @@ unreg:
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
@@ -306,7 +309,8 @@ unlock:
int blk_mq_sysfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 0fd409b8e86e..68ac23d0b640 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -498,7 +498,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
/*
- * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+ * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
* racing with it.
*/
@@ -515,7 +515,7 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a05ce7725031..8e659dc5fcf3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq)
static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q,
blk_qc_t qc)
{
- return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT];
+ return xa_load(&q->hctx_table,
+ (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT);
}
static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx,
@@ -312,7 +313,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
@@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = q->queue_hw_ctx[hctx_idx];
+ data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
@@ -736,6 +737,10 @@ static void blk_complete_request(struct request *req)
/* Completion has already been traced */
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ bio->bi_iter.bi_sector = req->__sector;
+
if (!is_flush)
bio_endio(bio);
bio = next;
@@ -883,10 +888,15 @@ static inline void blk_account_io_done(struct request *req, u64 now)
static void __blk_account_io_start(struct request *rq)
{
- /* passthrough requests can hold bios that do not have ->bi_bdev set */
- if (rq->bio && rq->bio->bi_bdev)
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (rq->bio)
rq->part = rq->bio->bi_bdev;
- else if (rq->q->disk)
+ else
rq->part = rq->q->disk->part0;
part_stat_lock();
@@ -1442,7 +1452,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
container_of(work, struct request_queue, timeout_work);
unsigned long next = 0;
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
@@ -2143,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2171,7 +2181,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
struct blk_mq_hw_ctx *hctx, *sq_hctx;
- int i;
+ unsigned long i;
sq_hctx = NULL;
if (blk_mq_has_sqsched(q))
@@ -2209,7 +2219,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
bool blk_mq_queue_stopped(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hctx_stopped(hctx))
@@ -2248,7 +2258,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue);
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
@@ -2266,7 +2276,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
@@ -2286,7 +2296,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_stopped_hw_queue(hctx, async);
@@ -2567,13 +2577,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q,
q->mq_ops->queue_rqs(&plug->mq_list);
}
+static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+{
+ struct blk_mq_hw_ctx *this_hctx = NULL;
+ struct blk_mq_ctx *this_ctx = NULL;
+ struct request *requeue_list = NULL;
+ unsigned int depth = 0;
+ LIST_HEAD(list);
+
+ do {
+ struct request *rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
+ rq_list_add(&requeue_list, rq);
+ continue;
+ }
+ list_add_tail(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ plug->mq_list = requeue_list;
+ trace_block_unplug(this_hctx->queue, depth, !from_sched);
+ blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched);
+}
+
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct blk_mq_hw_ctx *this_hctx;
- struct blk_mq_ctx *this_ctx;
struct request *rq;
- unsigned int depth;
- LIST_HEAD(list);
if (rq_list_empty(plug->mq_list))
return;
@@ -2609,35 +2642,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
return;
}
- this_hctx = NULL;
- this_ctx = NULL;
- depth = 0;
do {
- rq = rq_list_pop(&plug->mq_list);
-
- if (!this_hctx) {
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
- } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) {
- trace_block_unplug(this_hctx->queue, depth,
- !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx,
- &list, from_schedule);
- depth = 0;
- this_hctx = rq->mq_hctx;
- this_ctx = rq->mq_ctx;
-
- }
-
- list_add(&rq->queuelist, &list);
- depth++;
+ blk_mq_dispatch_plug_list(plug, from_schedule);
} while (!rq_list_empty(plug->mq_list));
-
- if (!list_empty(&list)) {
- trace_block_unplug(this_hctx->queue, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &list,
- from_schedule);
- }
}
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
@@ -2724,7 +2731,8 @@ static bool blk_mq_attempt_bio_merge(struct request_queue *q,
static struct request *blk_mq_get_new_requests(struct request_queue *q,
struct blk_plug *plug,
- struct bio *bio)
+ struct bio *bio,
+ unsigned int nsegs)
{
struct blk_mq_alloc_data data = {
.q = q,
@@ -2736,6 +2744,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
if (unlikely(bio_queue_enter(bio)))
return NULL;
+ if (blk_mq_attempt_bio_merge(q, bio, nsegs))
+ goto queue_exit;
+
+ rq_qos_throttle(q, bio);
+
if (plug) {
data.nr_tags = plug->nr_ios;
plug->nr_ios = 1;
@@ -2748,12 +2761,13 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
+queue_exit:
blk_queue_exit(q);
return NULL;
}
static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
- struct blk_plug *plug, struct bio *bio)
+ struct blk_plug *plug, struct bio **bio, unsigned int nsegs)
{
struct request *rq;
@@ -2763,12 +2777,19 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
if (!rq || rq->q != q)
return NULL;
- if (blk_mq_get_hctx_type(bio->bi_opf) != rq->mq_hctx->type)
+ if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) {
+ *bio = NULL;
return NULL;
- if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
+ }
+
+ rq_qos_throttle(q, *bio);
+
+ if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
return NULL;
- rq->cmd_flags = bio->bi_opf;
+ rq->cmd_flags = (*bio)->bi_opf;
plug->cached_rq = rq_list_next(rq);
INIT_LIST_HEAD(&rq->queuelist);
return rq;
@@ -2803,14 +2824,11 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
return;
- if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
- return;
-
- rq_qos_throttle(q, bio);
-
- rq = blk_mq_get_cached_request(q, plug, bio);
+ rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
if (!rq) {
- rq = blk_mq_get_new_requests(q, plug, bio);
+ if (!bio)
+ return;
+ rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
return;
}
@@ -3065,6 +3083,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
struct blk_mq_tags *drv_tags;
struct page *page;
+ if (list_empty(&tags->page_list))
+ return;
+
if (blk_mq_is_shared_tags(set->flags))
drv_tags = set->shared_tags;
else
@@ -3107,15 +3128,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
blk_mq_free_tags(tags);
}
+static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ int i;
+
+ for (i = 0; i < set->nr_maps; i++) {
+ unsigned int start = set->map[i].queue_offset;
+ unsigned int end = start + set->map[i].nr_queues;
+
+ if (hctx_idx >= start && hctx_idx < end)
+ break;
+ }
+
+ if (i >= set->nr_maps)
+ i = HCTX_TYPE_DEFAULT;
+
+ return i;
+}
+
+static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
+
+ return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
+}
+
static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags)
{
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
struct blk_mq_tags *tags;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3164,10 +3211,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
size_t rq_size, left;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -3412,6 +3458,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
blk_mq_remove_cpuhp(hctx);
+ xa_erase(&q->hctx_table, hctx_idx);
+
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
@@ -3421,12 +3469,11 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
- blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
@@ -3451,8 +3498,15 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
+
+ if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
+ goto exit_flush_rq;
+
return 0;
+ exit_flush_rq:
+ if (set->ops->exit_request)
+ set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -3612,7 +3666,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
static void blk_mq_map_swqueue(struct request_queue *q)
{
- unsigned int i, j, hctx_idx;
+ unsigned int j, hctx_idx;
+ unsigned long i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -3719,7 +3774,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (shared) {
@@ -3819,7 +3874,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -3830,7 +3885,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -3919,52 +3974,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- int i, j, end;
- struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
-
- if (q->nr_hw_queues < set->nr_hw_queues) {
- struct blk_mq_hw_ctx **new_hctxs;
-
- new_hctxs = kcalloc_node(set->nr_hw_queues,
- sizeof(*new_hctxs), GFP_KERNEL,
- set->numa_node);
- if (!new_hctxs)
- return;
- if (hctxs)
- memcpy(new_hctxs, hctxs, q->nr_hw_queues *
- sizeof(*hctxs));
- q->queue_hw_ctx = new_hctxs;
- kfree(hctxs);
- hctxs = new_hctxs;
- }
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i, j;
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
- int node;
- struct blk_mq_hw_ctx *hctx;
+ int old_node;
+ int node = blk_mq_get_hctx_node(set, i);
+ struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
- /*
- * If the hw queue has been mapped to another numa node,
- * we need to realloc the hctx. If allocation fails, fallback
- * to use the previous one.
- */
- if (hctxs[i] && (hctxs[i]->numa_node == node))
- continue;
+ if (old_hctx) {
+ old_node = old_hctx->numa_node;
+ blk_mq_exit_hctx(q, set, old_hctx, i);
+ }
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
- if (hctx) {
- if (hctxs[i])
- blk_mq_exit_hctx(q, set, hctxs[i], i);
- hctxs[i] = hctx;
- } else {
- if (hctxs[i])
- pr_warn("Allocate new hctx on node %d fails,\
- fallback to previous one on node %d\n",
- node, hctxs[i]->numa_node);
- else
+ if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ if (!old_hctx)
break;
+ pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
+ node, old_node);
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
+ WARN_ON_ONCE(!hctx);
}
}
/*
@@ -3973,24 +4004,27 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
- end = i;
} else {
j = i;
- end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- for (; j < end; j++) {
- struct blk_mq_hw_ctx *hctx = hctxs[j];
-
- if (hctx) {
- blk_mq_exit_hctx(q, set, hctx, j);
- hctxs[j] = NULL;
- }
- }
+ xa_for_each_start(&q->hctx_table, j, hctx, j)
+ blk_mq_exit_hctx(q, set, hctx, j);
mutex_unlock(&q->sysfs_lock);
}
+static void blk_mq_update_poll_flag(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->nr_maps > HCTX_TYPE_POLL &&
+ set->map[HCTX_TYPE_POLL].nr_queues)
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+}
+
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
@@ -4015,6 +4049,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
+ xa_init(&q->hctx_table);
+
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -4025,9 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
INIT_LIST_HEAD(&q->requeue_list);
@@ -4046,7 +4080,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
return 0;
err_hctxs:
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
@@ -4334,7 +4368,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
- int i, ret;
+ int ret;
+ unsigned long i;
if (!set)
return -EINVAL;
@@ -4493,6 +4528,7 @@ fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
+ blk_mq_update_poll_flag(q);
if (q->nr_hw_queues != set->nr_hw_queues) {
int i = prev_nr_hw_queues;
@@ -4709,7 +4745,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q)
{
if (queue_is_mq(q)) {
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
cancel_delayed_work_sync(&q->requeue_work);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 948791ea2a3e..2615bd58bad3 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -83,7 +83,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
enum hctx_type type,
unsigned int cpu)
{
- return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
+ return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..68267007da1c 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -177,20 +177,20 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
__rq_qos_requeue(q->rq_qos, rq);
}
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
{
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
+ if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ bio_flagged(bio, BIO_QOS_MERGED))) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
+ }
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- /*
- * BIO_TRACKED lets controllers know that a bio went through the
- * normal rq_qos path.
- */
if (q->rq_qos) {
- bio_set_flag(bio, BIO_TRACKED);
+ bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
@@ -205,8 +205,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
+ }
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4c6b7dff71e5..85c4ba006671 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -739,27 +739,6 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- elevator_exit(q);
- }
-
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
-}
-
/**
* blk_release_queue - releases all allocated resources of the request_queue
* @kobj: pointer to a kobject, whose container is a request_queue
@@ -787,12 +766,12 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep();
+ percpu_ref_exit(&q->q_usage_counter);
+
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
- blk_exit_queue(q);
-
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
@@ -880,6 +859,10 @@ int blk_register_queue(struct gendisk *disk)
goto put_dev;
}
+ ret = blk_crypto_sysfs_register(q);
+ if (ret)
+ goto put_dev;
+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
@@ -910,6 +893,7 @@ unlock:
return ret;
put_dev:
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
@@ -954,16 +938,18 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ blk_crypto_sysfs_unregister(q);
blk_trace_remove_sysfs(disk_to_dev(disk));
mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- elv_unregister_queue(q);
+ elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
+
+ /* Now that we've deleted all child objects, we can delete the queue. */
+ kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ kobject_del(&q->kobj);
+
mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a3b3ebc72dd4..469c483719be 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -874,7 +874,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
+ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
+ tg->flags & THROTL_TG_CANCELING) {
if (wait)
*wait = 0;
return true;
@@ -1137,12 +1138,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
- struct request_queue *q = td->queue;
struct throtl_service_queue *parent_sq;
+ struct request_queue *q;
bool dispatched;
int ret;
+ /* throtl_data may be gone, so figure out request queue by blkg */
+ if (tg)
+ q = tg->pd.blkg->q;
+ else
+ q = td->queue;
+
spin_lock_irq(&q->queue_lock);
+
+ if (!q->root_blkg)
+ goto out_unlock;
+
if (throtl_can_upgrade(td, NULL))
throtl_upgrade_state(td);
@@ -1766,6 +1777,39 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
return false;
}
+void blk_throtl_cancel_bios(struct request_queue *q)
+{
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ spin_lock_irq(&q->queue_lock);
+ /*
+ * queue_lock is held, rcu lock is not needed here technically.
+ * However, rcu lock is still held to emphasize that the following
+ * path needs RCU protection and to prevent warnings from lockdep.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&q->queue_lock);
+}
+
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
index b23a9f3abb82..c1b602996127 100644
--- a/block/blk-throttle.h
+++ b/block/blk-throttle.h
@@ -56,6 +56,7 @@ enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */
+ THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
};
enum {
@@ -162,11 +163,13 @@ static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
bool __blk_throtl_bio(struct bio *bio);
+void blk_throtl_cancel_bios(struct request_queue *q);
static inline bool blk_throtl_bio(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
diff --git a/block/blk.h b/block/blk.h
index ebaa59ca46ca..6f21859c7f0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -325,7 +325,7 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
+ return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
diff --git a/block/elevator.c b/block/elevator.c
index 6847ab6e7aa5..c319765892bb 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -192,6 +192,9 @@ void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
+ ioc_clear_queue(q);
+ blk_mq_sched_free_rqs(q);
+
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
@@ -516,17 +519,17 @@ int elv_register_queue(struct request_queue *q, bool uevent)
void elv_unregister_queue(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
lockdep_assert_held(&q->sysfs_lock);
- if (q) {
+ if (e && e->registered) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
e->registered = 0;
- /* Re-enable throttling in case elevator disabled it */
- wbt_enable_default(q);
}
}
@@ -593,11 +596,7 @@ int elevator_switch_mq(struct request_queue *q,
lockdep_assert_held(&q->sysfs_lock);
if (q->elevator) {
- if (q->elevator->registered)
- elv_unregister_queue(q);
-
- ioc_clear_queue(q);
- blk_mq_sched_free_rqs(q);
+ elv_unregister_queue(q);
elevator_exit(q);
}
@@ -608,7 +607,6 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
- blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out;
}
diff --git a/block/fops.c b/block/fops.c
index 3696665e586a..e49096354dcd 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -287,6 +287,8 @@ static void blkdev_bio_end_io_async(struct bio *bio)
struct kiocb *iocb = dio->iocb;
ssize_t ret;
+ WRITE_ONCE(iocb->private, NULL);
+
if (likely(!bio->bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
@@ -426,7 +428,8 @@ static int blkdev_writepages(struct address_space *mapping,
}
const struct address_space_operations def_blk_aops = {
- .set_page_dirty = __set_page_dirty_buffers,
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
.readpage = blkdev_readpage,
.readahead = blkdev_readahead,
.writepage = blkdev_writepage,
@@ -563,34 +566,37 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct block_device *bdev = iocb->ki_filp->private_data;
loff_t size = bdev_nr_bytes(bdev);
- size_t count = iov_iter_count(to);
loff_t pos = iocb->ki_pos;
size_t shorted = 0;
ssize_t ret = 0;
+ size_t count;
- if (unlikely(pos + count > size)) {
+ if (unlikely(pos + iov_iter_count(to) > size)) {
if (pos >= size)
return 0;
size -= pos;
- if (count > size) {
- shorted = count - size;
- iov_iter_truncate(to, size);
- }
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
}
+ count = iov_iter_count(to);
+ if (!count)
+ goto reexpand; /* skip atime */
+
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
if (iocb->ki_flags & IOCB_NOWAIT) {
- if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
- iocb->ki_pos + count - 1))
- return -EAGAIN;
+ if (filemap_range_needs_writeback(mapping, pos,
+ pos + count - 1)) {
+ ret = -EAGAIN;
+ goto reexpand;
+ }
} else {
- ret = filemap_write_and_wait_range(mapping,
- iocb->ki_pos,
- iocb->ki_pos + count - 1);
+ ret = filemap_write_and_wait_range(mapping, pos,
+ pos + count - 1);
if (ret < 0)
- return ret;
+ goto reexpand;
}
file_accessed(iocb->ki_filp);
@@ -600,12 +606,14 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_pos += ret;
count -= ret;
}
+ iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
- return ret;
+ goto reexpand;
}
ret = filemap_read(iocb, to, ret);
+reexpand:
if (unlikely(shorted))
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret;
diff --git a/block/genhd.c b/block/genhd.c
index 1ed46a6f94f5..c9a4fc90d3e9 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -25,10 +25,12 @@
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
+#include "blk-throttle.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
static struct kobject *block_depr;
@@ -557,6 +559,20 @@ out_free_ext_minor:
EXPORT_SYMBOL(device_add_disk);
/**
+ * blk_mark_disk_dead - mark a disk as dead
+ * @disk: disk to mark as dead
+ *
+ * Mark a disk as dead (e.g. surprise removed) and don't accept any new I/O
+ * to this disk.
+ */
+void blk_mark_disk_dead(struct gendisk *disk)
+{
+ set_bit(GD_DEAD, &disk->state);
+ blk_queue_start_drain(disk->queue);
+}
+EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
+
+/**
* del_gendisk - remove the gendisk
* @disk: the struct gendisk to remove
*
@@ -630,7 +646,8 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q);
- rq_qos_exit(q);
+ blk_throtl_cancel_bios(disk->queue);
+
blk_sync_queue(q);
blk_flush_integrity();
/*
@@ -923,12 +940,17 @@ ssize_t part_stat_show(struct device *dev,
struct disk_stats stat;
unsigned int inflight;
- part_stat_read_all(bdev, &stat);
if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, bdev);
else
inflight = part_in_flight(bdev);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(bdev, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(bdev, &stat);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -1096,6 +1118,31 @@ static const struct attribute_group *disk_attr_groups[] = {
NULL
};
+static void disk_release_mq(struct request_queue *q)
+{
+ blk_mq_cancel_work_sync(q);
+
+ /*
+ * There can't be any non-passthrough bios in flight here, but
+ * requests stay around longer, including passthrough ones so we
+ * still need to freeze the queue here.
+ */
+ blk_mq_freeze_queue(q);
+
+ /*
+ * Since the I/O scheduler exit code may access cgroup information,
+ * perform I/O scheduler exit before disassociating from the block
+ * cgroup controller.
+ */
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ __blk_mq_unfreeze_queue(q, true);
+}
+
/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
@@ -1117,11 +1164,15 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
- blk_mq_cancel_work_sync(disk->queue);
+ if (queue_is_mq(disk->queue))
+ disk_release_mq(disk->queue);
+
+ blkcg_exit_queue(disk->queue);
disk_release_events(disk);
kfree(disk->random);
xa_destroy(&disk->part_tbl);
+
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
@@ -1188,12 +1239,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
xa_for_each(&gp->part_tbl, idx, hd) {
if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
continue;
- part_stat_read_all(hd, &stat);
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
inflight = part_in_flight(hd);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(hd, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(hd, &stat);
seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
@@ -1322,6 +1378,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
+ if (blkcg_init_queue(q))
+ goto out_erase_part0;
+
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
@@ -1334,6 +1393,8 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
#endif
return disk;
+out_erase_part0:
+ xa_erase(&disk->part_tbl, 0);
out_destroy_part_tbl:
xa_destroy(&disk->part_tbl);
disk->part0->bd_disk = NULL;