From 52abfcbd57eefdd54737fc8c2dc79d8f46d4a3e5 Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi
Date: Thu, 22 Oct 2020 16:58:41 -0400
Subject: blk-cgroup: Fix memleak on error path

If the new_blkg allocation races with a blk_policy change and
blkg_lookup_check() fails, new_blkg is leaked.

Acked-by: Tejun Heo
Signed-off-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f9b55614d67d..f9389b7cf823 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -663,6 +663,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		blkg = blkg_lookup_check(pos, pol, q);
 		if (IS_ERR(blkg)) {
 			ret = PTR_ERR(blkg);
+			blkg_free(new_blkg);
 			goto fail_unlock;
 		}
--
cgit v1.2.3


From f255c19b3ab46d3cad3b1b2e1036f4c926cb1d0c Mon Sep 17 00:00:00 2001
From: Gabriel Krisman Bertazi
Date: Thu, 22 Oct 2020 16:58:42 -0400
Subject: blk-cgroup: Pre-allocate tree node on blkg_conf_prep

Similarly to commit 457e490f2b741 ("blkcg: allocate struct blkcg_gq
outside request queue spinlock"), blkg_create() can also trigger
occasional -ENOMEM failures at the radix insertion, because any
allocation inside blkg_create() has to be non-blocking, making it more
likely to fail.  This causes trouble for userspace tools trying to
configure io weights, which then have to deal with this condition.

This patch reduces the occurrence of -ENOMEMs on this path by
preloading the radix tree element in a GFP_KERNEL context, such that
the later non-blocking insertion is guaranteed not to fail.

A similar solution already exists in blkcg_init_queue() for the same
situation.

Acked-by: Tejun Heo
Signed-off-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index f9389b7cf823..c68bdf58c9a6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -657,6 +657,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			goto fail;
 		}
 
+		if (radix_tree_preload(GFP_KERNEL)) {
+			blkg_free(new_blkg);
+			ret = -ENOMEM;
+			goto fail;
+		}
+
 		rcu_read_lock();
 		spin_lock_irq(&q->queue_lock);
 
@@ -664,7 +670,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 		if (IS_ERR(blkg)) {
 			ret = PTR_ERR(blkg);
 			blkg_free(new_blkg);
-			goto fail_unlock;
+			goto fail_preloaded;
 		}
 
 		if (blkg) {
@@ -673,10 +679,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 			blkg = blkg_create(pos, q, new_blkg);
 			if (IS_ERR(blkg)) {
 				ret = PTR_ERR(blkg);
-				goto fail_unlock;
+				goto fail_preloaded;
 			}
 		}
 
+		radix_tree_preload_end();
+
 		if (pos == blkcg)
 			goto success;
 	}
@@ -686,6 +694,8 @@ success:
 	ctx->body = input;
 	return 0;
 
+fail_preloaded:
+	radix_tree_preload_end();
 fail_unlock:
 	spin_unlock_irq(&q->queue_lock);
 	rcu_read_unlock();
--
cgit v1.2.3
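
The preload pattern above is the key idea: the allocation that may block or
fail is performed before the lock is taken, so the insertion done under the
lock can no longer fail with -ENOMEM.  A minimal userspace sketch of the same
idea (illustrative names and plain C, not the kernel radix-tree API):

/*
 * Userspace analogue of the preload pattern: allocate the node with a
 * blocking allocator *before* taking the lock, so the work done inside
 * the critical section cannot fail on allocation.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int key; struct node *next; };

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *table_head;

static int insert_key(int key)
{
        /* "Preload": may block or fail here, outside the critical section. */
        struct node *new = malloc(sizeof(*new));

        if (!new)
                return -ENOMEM;
        new->key = key;

        pthread_mutex_lock(&table_lock);
        /* Only link the preallocated node; no allocation under the lock. */
        new->next = table_head;
        table_head = new;
        pthread_mutex_unlock(&table_lock);
        return 0;
}

int main(void)
{
        for (int key = 0; key < 4; key++)
                if (insert_key(key))
                        return 1;
        printf("inserted 4 keys without allocating under the lock\n");
        return 0;
}

radix_tree_preload()/radix_tree_preload_end() play the role of the malloc()
here: they stash preallocated tree nodes in a per-CPU pool so the later
insertion under the queue lock is guaranteed to find one.
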
From 4977d121bc9bc5138d4d48b85469123001859573 Mon Sep 17 00:00:00 2001
From: Naohiro Aota
Date: Wed, 28 Oct 2020 16:25:36 +0900
Subject: block: advance iov_iter on bio_add_hw_page failure

When the bio's size reaches max_append_sectors, bio_add_hw_page()
returns 0 and __bio_iov_append_get_pages() returns -EINVAL.  This is
the expected result of building a bio small enough not to be split in
the IO path.  However, the iov_iter is not advanced in this case, so
the same pages are filled into the bio again and again.

Fix this by advancing the iov_iter over the pages that have already
been processed.

Fixes: 0512a75b98f8 ("block: Introduce REQ_OP_ZONE_APPEND")
Cc: stable@vger.kernel.org # 5.8+
Reviewed-by: Johannes Thumshirn
Signed-off-by: Naohiro Aota
Signed-off-by: Jens Axboe
---
 block/bio.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'block')

diff --git a/block/bio.c b/block/bio.c
index e6e26d7a1ffb..fa01bef35bb1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1044,6 +1044,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
+	int ret = 0;
 
 	if (WARN_ON_ONCE(!max_append_sectors))
 		return 0;
@@ -1066,15 +1067,17 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
 		if (bio_add_hw_page(q, bio, page, len, offset,
-				max_append_sectors, &same_page) != len)
-			return -EINVAL;
+				max_append_sectors, &same_page) != len) {
+			ret = -EINVAL;
+			break;
+		}
 		if (same_page)
 			put_page(page);
 		offset = 0;
 	}
 
-	iov_iter_advance(iter, size);
-	return 0;
+	iov_iter_advance(iter, size - left);
+	return ret;
 }
 
 /**
--
cgit v1.2.3
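
The essence of the fix is the size - left accounting: advance the iterator
only by the bytes that actually made it into the bio, so a retry continues
with the remaining pages instead of the same ones.  A small self-contained
sketch of that accounting (illustrative userspace C, not the kernel helper
itself):

/*
 * Sketch of the partial-advance accounting: when the bio-like container
 * fills up mid-way, report only the bytes that were actually consumed
 * (size - left) so the caller can resume from the right position.
 */
#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE       4096
#define BIO_CAPACITY    (2 * PAGE_SIZE) /* stands in for max_append_sectors */

static size_t bio_used;

/* Accepts the whole chunk or nothing, like the hw-limit check. */
static size_t bio_add(size_t len)
{
        if (bio_used + len > BIO_CAPACITY)
                return 0;
        bio_used += len;
        return len;
}

int main(void)
{
        size_t size = 3 * PAGE_SIZE;    /* bytes pinned from the iterator */
        size_t left = size;

        while (left) {
                size_t len = left < PAGE_SIZE ? left : PAGE_SIZE;

                if (bio_add(len) != len)
                        break;          /* bio is full: stop, as on -EINVAL */
                left -= len;
        }

        /* The fix: advance by what was consumed, not by everything pinned. */
        printf("advance iterator by %zu bytes (of %zu pinned)\n",
               size - left, size);
        return 0;
}
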
From 65ff5cd04551daf2c11c7928e48fc3483391c900 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 30 Oct 2020 10:42:53 +0800
Subject: blk-mq: mark flush request as IDLE in flush_end_io()

Mark the flush request as IDLE in its .end_io(), aligning it with how
normal requests behave.  The flush request stays in the in-flight tags
if we're not using an IO scheduler, so we need to change its state to
IDLE.  Otherwise, we will hang in blk_mq_tagset_wait_completed_request()
during error recovery because the flush request's state is left as
COMPLETED.

Reported-by: Yi Zhang
Signed-off-by: Ming Lei
Tested-by: Yi Zhang
Cc: Chao Leng
Cc: Sagi Grimberg
Signed-off-by: Jens Axboe
---
 block/blk-flush.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 53abb5c73d99..e32958f0b687 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -225,6 +225,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
+	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
 	if (!refcount_dec_and_test(&flush_rq->ref)) {
 		fq->rq_status = error;
 		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 		return;
 	}
--
cgit v1.2.3


From 7e890c37c25c7cbca37ff0ab292873d8146e713b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 12 Nov 2020 17:50:04 +0100
Subject: block: add a return value to set_capacity_revalidate_and_notify

Return whether the function ended up sending an uevent or not.

Cc: stable@vger.kernel.org # v5.9
Signed-off-by: Christoph Hellwig
Reviewed-by: Petr Vorel
Signed-off-by: Jens Axboe
---
 block/genhd.c         | 5 ++++-
 include/linux/genhd.h | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/genhd.c b/block/genhd.c
index 0a273211fec2..9387f050c248 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -49,7 +49,7 @@ static void disk_release_events(struct gendisk *disk);
  * Set disk capacity and notify if the size is not currently
  * zero and will not be set to zero
  */
-void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
+bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
 					bool update_bdev)
 {
 	sector_t capacity = get_capacity(disk);
@@ -62,7 +62,10 @@ void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
 		char *envp[] = { "RESIZE=1", NULL };
 
 		kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+		return true;
 	}
+
+	return false;
 }
 EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);
 
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 38f23d757013..03da3f603d30 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -315,7 +315,7 @@ static inline int get_disk_ro(struct gendisk *disk)
 extern void disk_block_events(struct gendisk *disk);
 extern void disk_unblock_events(struct gendisk *disk);
 extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
-void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
+bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
 		bool update_bdev);
 
 /* drivers/char/random.c */
--
cgit v1.2.3


From 9f16a66733c90b5f33f624b0b0e36a345b0aaf93 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Fri, 13 Nov 2020 21:44:48 +0800
Subject: block: mark flush request as IDLE when it is really finished

To avoid a use-after-free on the flush request, its .end_io() is called
from both the timeout code path and __blk_mq_end_request().  As long as
the flush request's ref hasn't dropped to zero, the request is still in
use and we can't mark it as IDLE, so fix this by marking it IDLE only
when its refcount really drops to zero.

Fixes: 65ff5cd04551 ("blk-mq: mark flush request as IDLE in flush_end_io()")
Signed-off-by: Ming Lei
Cc: Yi Zhang
Signed-off-by: Jens Axboe
---
 block/blk-flush.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/blk-flush.c b/block/blk-flush.c
index e32958f0b687..fd5cee9f1a3b 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -225,13 +225,18 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
 	/* release the tag's ownership to the req cloned from */
 	spin_lock_irqsave(&fq->mq_flush_lock, flags);
-	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
 	if (!refcount_dec_and_test(&flush_rq->ref)) {
 		fq->rq_status = error;
 		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 		return;
 	}
 
+	/*
+	 * Flush request has to be marked as IDLE when it is really ended
+	 * because its .end_io() is called from timeout code path too for
+	 * avoiding use-after-free.
+	 */
+	WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
 	if (fq->rq_status != BLK_STS_OK)
 		error = fq->rq_status;
--
cgit v1.2.3
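
Together with the earlier flush_end_io() change, the rule these two fixes
converge on is that only the context dropping the last reference may flip the
request back to IDLE, since the timeout path may still be looking at the
request until then.  A hedged userspace sketch of that rule (illustrative
names, not the blk-mq types):

/*
 * Sketch only: the end-of-life state transition happens exclusively in
 * whichever caller drops the final reference; earlier droppers leave
 * the state alone because the object is still in use.
 */
#include <stdatomic.h>
#include <stdio.h>

enum rq_state { RQ_IN_FLIGHT, RQ_COMPLETE, RQ_IDLE };

struct request {
        atomic_int ref;
        enum rq_state state;
};

static void end_request(struct request *rq)
{
        if (atomic_fetch_sub(&rq->ref, 1) != 1)
                return;                 /* someone else still holds a reference */

        /* Last reference gone: now it is safe to recycle the request. */
        rq->state = RQ_IDLE;
}

int main(void)
{
        struct request rq = { .ref = 2, .state = RQ_COMPLETE };

        end_request(&rq);               /* first put: ref 2 -> 1, state unchanged */
        printf("after first put:  state=%d\n", rq.state);
        end_request(&rq);               /* second put drops the final reference */
        printf("after second put: state=%d\n", rq.state);
        return 0;
}
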
From b7131ee0bac5e5df73e4098e77bbddb3a31d06ff Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Sat, 14 Nov 2020 19:12:46 +0100
Subject: blk-cgroup: fix a hd_struct leak in blkcg_fill_root_iostats

disk_get_part() needs to be paired with a disk_put_part().

Cc: stable@vger.kernel.org
Fixes: ef45fe470e1 ("blk-cgroup: show global disk stats in root cgroup io.stat")
Signed-off-by: Christoph Hellwig
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c68bdf58c9a6..54fbe1e80cc4 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -849,6 +849,7 @@ static void blkcg_fill_root_iostats(void)
 			blkg_iostat_set(&blkg->iostat.cur, &tmp);
 			u64_stats_update_end(&blkg->iostat.sync);
 		}
+		disk_put_part(part);
 	}
 }
--
cgit v1.2.3
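
The leak fixed here is the usual get/put pairing problem: every
disk_get_part() taken in the loop body needs a matching disk_put_part()
before the next iteration.  A trivial userspace analogue of the pattern
(illustrative, not the genhd API):

/*
 * Sketch of the get/put discipline: each reference taken inside the
 * loop body is dropped before the iteration ends, so no references
 * accumulate across iterations.
 */
#include <stdio.h>
#include <stdlib.h>

struct part {
        int refcount;
        int id;
};

static struct part *part_get(int id)
{
        struct part *p = malloc(sizeof(*p));

        if (!p)
                return NULL;
        p->refcount = 1;
        p->id = id;
        return p;
}

static void part_put(struct part *p)
{
        if (--p->refcount == 0)
                free(p);
}

int main(void)
{
        for (int id = 0; id < 3; id++) {
                struct part *p = part_get(id);

                if (!p)
                        continue;
                printf("filling stats for part %d\n", p->id);
                /* The fix: drop the reference before the next iteration. */
                part_put(p);
        }
        return 0;
}
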