From 3f89ac587baa0c0460c977d1596e16f950815f05 Mon Sep 17 00:00:00 2001
From: Chaitanya Kulkarni
Date: Mon, 24 Apr 2023 16:46:28 -0700
Subject: block/drivers: remove dead clear of random flag

QUEUE_FLAG_ADD_RANDOM is not set before we clear it for "null_blk",
"brd", "nbd", "zram", and "bcache" since by default we don't set
"QUEUE_FLAG_ADD_RANDOM" to MQ ops.

Remove dead clear of QUEUE_FLAG_ADD_RANDOM in above listed drivers.

Signed-off-by: Chaitanya Kulkarni
Reviewed-by: Sergey Senozhatsky #zram
Link: https://lore.kernel.org/r/20230424234628.45544-2-kch@nvidia.com
Signed-off-by: Jens Axboe
---
 drivers/block/brd.c           | 1 -
 drivers/block/nbd.c           | 1 -
 drivers/block/null_blk/main.c | 1 -
 drivers/block/zram/zram_drv.c | 1 -
 drivers/md/bcache/super.c     | 1 -
 5 files changed, 5 deletions(-)
(limited to 'drivers')

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 34177f1bd97d..bcad9b926b0c 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -404,7 +404,6 @@ static int brd_alloc(int i)
 	/* Tell the block layer that this is not a rotational device */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
 	err = add_disk(disk);
 	if (err)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d445fd0934bd..7c96ec4e99df 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1805,7 +1805,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
 	 * Tell the block layer that we are not a rotational device
 	 */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
 	disk->queue->limits.discard_granularity = 0;
 	blk_queue_max_discard_sectors(disk->queue, 0);
 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index fcb61f1d5437..a60671a42de9 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -2145,7 +2145,6 @@ static int null_add_dev(struct nullb_device *dev)
 
 	nullb->q->queuedata = nullb;
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
 
 	mutex_lock(&lock);
 	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aa490da3cef2..f7d4c0d5ad0d 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2323,7 +2323,6 @@ static int zram_add(void)
 	/* zram devices sort of resembles non-rotational disks */
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
 	blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
 
 	/*
 	 * To ensure that we always get PAGE_SIZE aligned
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index ba3909bb6bea..7e9d19fd21dd 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -971,7 +971,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
 	}
 
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
-	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
 
 	blk_queue_write_cache(q, true, true);

-- cgit v1.2.3
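Why the removed calls were "dead" is easiest to see with the flag helpers reduced to their underlying bit operations: clearing a bit that was never set leaves the flag word unchanged. A minimal user-space model of that point follows; the flag bit numbers and the flag_set()/flag_clear() helpers are illustrative stand-ins for blk_queue_flag_set()/blk_queue_flag_clear() on q->queue_flags, not the kernel's definitions.

#include <stdio.h>

#define QUEUE_FLAG_NONROT      6	/* illustrative bit numbers */
#define QUEUE_FLAG_ADD_RANDOM  7

static unsigned long queue_flags;	/* stands in for q->queue_flags */

static void flag_set(int flag)   { queue_flags |=  (1UL << flag); }
static void flag_clear(int flag) { queue_flags &= ~(1UL << flag); }

int main(void)
{
	unsigned long before;

	/* blk-mq initialization: ADD_RANDOM is not among the default flags */
	flag_set(QUEUE_FLAG_NONROT);
	before = queue_flags;

	flag_clear(QUEUE_FLAG_ADD_RANDOM);	/* the "dead" clear */

	printf("flags before=%#lx after=%#lx (%s)\n", before, queue_flags,
	       before == queue_flags ? "unchanged" : "changed");
	return 0;
}
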
From bd9e9916c32fd4b4fb4e879e05bd1568ee02ec93 Mon Sep 17 00:00:00 2001
From: Eric Blake
Date: Mon, 10 Apr 2023 13:06:10 -0500
Subject: block nbd: use req.cookie instead of req.handle

The NBD spec was recently changed [1] to refer to the opaque client
identifier as a 'cookie' rather than a 'handle', but has for a much
longer time listed it as a 64-bit value, and declares that all values
in the NBD protocol are sent in network byte order (big-endian).

Because the value is opaque to the server, it doesn't usually matter
what endianness we send as the client - as long as we are consistent
that either we byte-swap on both write and read, or on neither, then
we can match server replies back to our requests.  That said, our
internal use of the cookie is as a 64-bit number (well, as two 32-bit
numbers concatenated together), rather than as 8 individual bytes; so
prior to this commit, we ARE leaking the native endianness of our
internals as a client out to the server.  We don't know of any server
that will actually inspect the opaque value and behave differently
depending on whether a little-endian or big-endian client is sending
requests, but since we DO log the cookie value, a wireshark capture of
the network traffic is easier to correlate back to the kernel traffic
of a big-endian host (where the u64 and char[8] representations are
the same) than of a little-endian host (where if wireshark honors the
NBD spec and displays a u64 in network byte order, it is byte-swapped
from what the kernel logged).

The fix in this patch is thus two-part: it now consistently uses
network byte order for the opaque value (no difference to a big-endian
machine, but an extra byteswap on a little-endian machine; probably in
the noise compared to the overhead of network traffic in general), and
now uses a 64-bit integer instead of char[8] as its preferred access
to the opaque value (direct assignment instead of memcpy()).

Signed-off-by: Eric Blake
Reviewed-by: Josef Bacik
Link: https://lore.kernel.org/r/20230410180611.1051618-4-eblake@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/nbd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'drivers')

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 7c96ec4e99df..9c35c958f2c8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -609,7 +609,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 		request.len = htonl(size);
 	}
 	handle = nbd_cmd_handle(cmd);
-	memcpy(request.handle, &handle, sizeof(handle));
+	request.cookie = cpu_to_be64(handle);
 
 	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
 
@@ -621,7 +621,7 @@ static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
 	trace_nbd_header_sent(req, handle);
 	if (result < 0) {
 		if (was_interrupted(result)) {
-			/* If we havne't sent anything we can just return BUSY,
+			/* If we haven't sent anything we can just return BUSY,
 			 * however if we have sent something we need to make
 			 * sure we only allow this req to be sent until we are
 			 * completely done.
@@ -735,7 +735,7 @@ static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
 	u32 tag;
 	int ret = 0;
 
-	memcpy(&handle, reply->handle, sizeof(handle));
+	handle = be64_to_cpu(reply->cookie);
 	tag = nbd_handle_to_tag(handle);
 	hwq = blk_mq_unique_tag_to_hwq(tag);
 	if (hwq < nbd->tag_set.nr_hw_queues)
-- cgit v1.2.3
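The consistency argument above can be checked with a few lines of user-space C. In this sketch, htobe64()/be64toh() play the role of the kernel's cpu_to_be64()/be64_to_cpu(), and the cookie value is made up (the real driver packs the blk-mq tag into the upper 32 bits, loosely mirrored here):

#include <endian.h>
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* made-up cookie: "tag" in the upper 32 bits, counter in the lower */
	uint64_t cookie = ((uint64_t)0x1234 << 32) | 0x0001;
	uint64_t wire, echoed;
	unsigned char buf[8];

	wire = htobe64(cookie);			/* what goes into request.cookie */
	memcpy(buf, &wire, sizeof(buf));	/* the on-the-wire representation */

	printf("wire bytes:");
	for (int i = 0; i < 8; i++)
		printf(" %02x", (unsigned)buf[i]);	/* 00 00 12 34 00 00 00 01 on any host */
	printf("\n");

	echoed = be64toh(wire);			/* server echoes the value back untouched */
	printf("round trip %s\n", echoed == cookie ? "matches" : "MISMATCH");
	return 0;
}

Because the swap is applied symmetrically on send and receive, replies still match requests, and the wire bytes (and thus a wireshark capture) now look the same regardless of the client's native endianness.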
From fc05e06e6098ca2c28f7a10da0e00aeea20fa59e Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Mon, 17 Apr 2023 19:15:37 +0200
Subject: md/raid5: Improve performance for sequential IO

Commit 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()") changed
the order in which requests for underlying disks are created. Since
for large sequential IO adding of requests frequently races with
md_raid5 thread submitting bios to underlying disks, this results in a
change in IO pattern because intermediate states of new order of
request creation result in more smaller discontiguous requests.

For RAID5 on top of three rotational disks our performance testing
revealed this results in regression in write throughput:

iozone -a -s 131072000 -y 4 -q 8 -i 0 -i 1 -R

before 7e55c60acfbb:
              KB  reclen   write  rewrite    read    reread
       131072000       4  493670   525964   524575   513384
       131072000       8  540467   532880   512028   513703

after 7e55c60acfbb:
              KB  reclen   write  rewrite    read    reread
       131072000       4  421785   456184   531278   509248
       131072000       8  459283   456354   528449   543834

To reduce the amount of discontiguous requests we can start generating
requests with the stripe with the lowest chunk offset as that has the
best chance of being adjacent to IO queued previously. This improves
the performance to:

              KB  reclen   write  rewrite    read    reread
       131072000       4  497682   506317   518043   514559
       131072000       8  514048   501886   506453   504319

restoring big part of the regression.

Fixes: 7e55c60acfbb ("md/raid5: Pivot raid5_make_request()")
Cc: stable@vger.kernel.org # v6.0+
Signed-off-by: Jan Kara
Reviewed-by: Logan Gunthorpe
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230417171537.17899-1-jack@suse.cz
---
 drivers/md/raid5.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)
(limited to 'drivers')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 812a12e3e41a..4739ed891e75 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6079,6 +6079,38 @@ out_release:
 	return ret;
 }
 
+/*
+ * If the bio covers multiple data disks, find sector within the bio that has
+ * the lowest chunk offset in the first chunk.
+ */
+static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
+					      struct bio *bi)
+{
+	int sectors_per_chunk = conf->chunk_sectors;
+	int raid_disks = conf->raid_disks;
+	int dd_idx;
+	struct stripe_head sh;
+	unsigned int chunk_offset;
+	sector_t r_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
+	sector_t sector;
+
+	/* We pass in fake stripe_head to get back parity disk numbers */
+	sector = raid5_compute_sector(conf, r_sector, 0, &dd_idx, &sh);
+	chunk_offset = sector_div(sector, sectors_per_chunk);
+	if (sectors_per_chunk - chunk_offset >= bio_sectors(bi))
+		return r_sector;
+	/*
+	 * Bio crosses to the next data disk. Check whether it's in the same
+	 * chunk.
+	 */
+	dd_idx++;
+	while (dd_idx == sh.pd_idx || dd_idx == sh.qd_idx)
+		dd_idx++;
+	if (dd_idx >= raid_disks)
+		return r_sector;
+	return r_sector + sectors_per_chunk - chunk_offset;
+}
+
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
@@ -6150,6 +6182,17 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}
 	md_account_bio(mddev, &bi);
 
+	/*
+	 * Lets start with the stripe with the lowest chunk offset in the first
+	 * chunk. That has the best chances of creating IOs adjacent to
+	 * previous IOs in case of sequential IO and thus creates the most
+	 * sequential IO pattern. We don't bother with the optimization when
+	 * reshaping as the performance benefit is not worth the complexity.
+	 */
+	if (likely(conf->reshape_progress == MaxSector))
+		logical_sector = raid5_bio_lowest_chunk_sector(conf, bi);
+	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
+
 	add_wait_queue(&conf->wait_for_overlap, &wait);
 	while (1) {
 		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
@@ -6178,7 +6221,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 			continue;
 		}
 
-		s = find_first_bit(ctx.sectors_to_do, stripe_cnt);
+		s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
 		if (s == stripe_cnt)
 			break;
 
-- cgit v1.2.3
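The arithmetic behind raid5_bio_lowest_chunk_sector() can be sketched without any md machinery. The toy model below drops the parity-disk handling and stripe-size rounding of the real helper and keeps only the decision it makes: keep the bio's start sector if the bio fits in the remainder of its chunk, otherwise start from the sector whose chunk offset is zero. The chunk size and bio lengths are made up for illustration.

#include <stdio.h>

#define SECTORS_PER_CHUNK 8	/* toy chunk size; real arrays use far larger chunks */

static unsigned long lowest_chunk_sector(unsigned long start, unsigned long len)
{
	unsigned long chunk_offset = start % SECTORS_PER_CHUNK;

	/* Bio fits in the remainder of this chunk: keep the original start */
	if (SECTORS_PER_CHUNK - chunk_offset >= len)
		return start;
	/* Otherwise begin with the part that starts a new chunk (offset 0) */
	return start + SECTORS_PER_CHUNK - chunk_offset;
}

int main(void)
{
	/* A 12-sector bio starting 5 sectors into a chunk crosses a boundary */
	printf("start=5 len=12 -> submit from %lu\n", lowest_chunk_sector(5, 12));	/* 8 */
	/* A 3-sector bio stays inside its chunk */
	printf("start=5 len=3  -> submit from %lu\n", lowest_chunk_sector(5, 3));	/* 5 */
	return 0;
}

Starting at chunk offset zero is what gives the request the best chance of being merged with IO already queued for the previous stripe, which is the effect the iozone numbers above measure.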
From b1211978ecf19bceb63a04f53fea4b5d73832a4a Mon Sep 17 00:00:00 2001
From: Jonathan Derrick
Date: Mon, 24 Apr 2023 19:14:38 -0600
Subject: md: Fix bitmap offset type in sb writer

Bitmap offset is allowed to be negative, indicating that bitmap
precedes metadata. Change the type back from sector_t to loff_t to
satisfy conditionals and calculations.

Reported-by: Dan Carpenter
Link: https://lore.kernel.org/linux-raid/CAPhsuW6HuaUJ5WcyPajVgUfkQFYp2D_cy1g6qxN4CU_gP2=z7g@mail.gmail.com/
Fixes: 10172f200b67 ("md: Fix types in sb writer")
Signed-off-by: Jonathan Derrick
Suggested-by: Yu Kuai
Reviewed-by: Yu Kuai
Signed-off-by: Song Liu
Link: https://lore.kernel.org/r/20230425011438.71046-1-jonathan.derrick@linux.dev
---
 drivers/md/md-bitmap.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'drivers')

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 920bb68156d2..bc8d7565171d 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -219,7 +219,7 @@ static unsigned int optimal_io_size(struct block_device *bdev,
 }
 
 static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
-				   sector_t start, sector_t boundary)
+				   loff_t start, loff_t boundary)
 {
 	if (io_size != opt_size &&
 	    start + opt_size / SECTOR_SIZE <= boundary)
@@ -237,8 +237,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
-	sector_t offset = mddev->bitmap_info.offset;
-	sector_t ps, sboff, doff;
+	loff_t sboff, offset = mddev->bitmap_info.offset;
+	sector_t ps, doff;
 	unsigned int size = PAGE_SIZE;
 	unsigned int opt_size = PAGE_SIZE;
 
-- cgit v1.2.3
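The failure mode being fixed is the usual unsigned-type trap: sector_t is unsigned, so a negative bitmap offset (bitmap placed before the metadata) wraps to a huge positive value and sign-dependent conditionals and offset arithmetic silently go wrong, while a signed loff_t behaves as intended. A small user-space illustration, with made-up constants rather than md's real layout values:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t_model;	/* stands in for the unsigned sector_t */
typedef int64_t  loff_t_model;		/* stands in for the signed loff_t    */

int main(void)
{
	long long bitmap_offset = -16;	/* bitmap stored 16 sectors before the superblock */

	sector_t_model as_unsigned = (sector_t_model)bitmap_offset;
	loff_t_model   as_signed   = bitmap_offset;

	/* "does the bitmap precede the metadata?" */
	printf("unsigned: offset < 0 -> %d\n", as_unsigned < 0);	/* 0: the sign is lost */
	printf("signed:   offset < 0 -> %d\n", as_signed < 0);		/* 1 */

	/* an offset-relative position such as offset + io_size / SECTOR_SIZE */
	printf("unsigned: offset + 8 = %llu\n", (unsigned long long)(as_unsigned + 8));
	printf("signed:   offset + 8 = %lld\n", (long long)(as_signed + 8));
	return 0;
}
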
From 3899d94e3831ee07ea6821c032dc297aec80586a Mon Sep 17 00:00:00 2001
From: Christoph Böhmwalder
Date: Wed, 3 May 2023 14:19:37 +0200
Subject: drbd: correctly submit flush bio on barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When we receive a flush command (or "barrier" in DRBD), we currently
use a REQ_OP_FLUSH with the REQ_PREFLUSH flag set.

The correct way to submit a flush bio is by using a REQ_OP_WRITE
without any data, and set the REQ_PREFLUSH flag.

Since commit b4a6bb3a67aa ("block: add a sanity check for non-write
flush/fua bios"), this triggers a warning in the block layer, but this
has been broken for quite some time before that.

So use the correct set of flags to actually make the flush happen.

Cc: Christoph Hellwig
Cc: stable@vger.kernel.org
Fixes: f9ff0da56437 ("drbd: allow parallel flushes for multi-volume resources")
Reported-by: Thomas Voegtle
Signed-off-by: Christoph Böhmwalder
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20230503121937.17232-1-christoph.boehmwalder@linbit.com
Signed-off-by: Jens Axboe
---
 drivers/block/drbd/drbd_receiver.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index e54404c632e7..34b112752ab1 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1283,7 +1283,7 @@ static void one_flush_endio(struct bio *bio)
 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
 {
 	struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
-				    REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
+				    REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);
 	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
 
 	if (!octx) {
-- cgit v1.2.3
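A hedged sketch of the rule the fix follows, modeled on the patched submit_one_flush(): a driver-issued empty flush is a zero-length WRITE bio with REQ_PREFLUSH set, and the names and completion handler below are illustrative stand-ins, not drbd's real ones.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void demo_flush_endio(struct bio *bio)
{
	/* record the result / count the completion, as drbd's one_flush_endio() does */
	bio_put(bio);
}

static void demo_submit_flush(struct block_device *bdev, void *private)
{
	/* zero data pages + REQ_OP_WRITE + REQ_PREFLUSH = an empty preflush */
	struct bio *bio = bio_alloc(bdev, 0, REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);

	bio->bi_private = private;
	bio->bi_end_io = demo_flush_endio;
	submit_bio(bio);
}

Submitting REQ_OP_FLUSH directly is what the sanity check from b4a6bb3a67aa rejects; the preflush flag on a data-less write is the interface drivers are expected to use.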
From c0b79b0ff53be5b05be98e3caaa6a39de1fe9520 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 2 May 2023 10:42:31 +0800
Subject: ublk: add timeout handler

Add timeout handler, so that we can provide forward progress guarantee
for unprivileged ublk, which can't be trusted.

One thing is that sync() calls sync_bdevs(wait) for all block devices
after running sync_bdevs(no_wait), and if one device can't move on,
the sync() won't return any more. Add timeout for unprivileged ublk to
avoid such affect for other users which call sync() syscall.

Meantime clear UBLK_F_USER_RECOVERY_REISSUE for unprivileged ublk
since that feature may cause IO hang too.

Fixes: 4093cb5a0634 ("ublk_drv: add mechanism for supporting unprivileged ublk device")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20230502024231.888498-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
(limited to 'drivers')

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 253008b2091d..e96309f2e1ad 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -129,6 +129,7 @@ struct ublk_queue {
 	unsigned long io_addr;	/* mapped vm address */
 	unsigned int max_io_sz;
 	bool force_abort;
+	bool timeout;
 	unsigned short nr_io_ready;	/* how many ios setup */
 	struct ublk_device *dev;
 	struct ublk_io ios[];
@@ -894,6 +895,22 @@ static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
 	}
 }
 
+static enum blk_eh_timer_return ublk_timeout(struct request *rq)
+{
+	struct ublk_queue *ubq = rq->mq_hctx->driver_data;
+
+	if (ubq->flags & UBLK_F_UNPRIVILEGED_DEV) {
+		if (!ubq->timeout) {
+			send_sig(SIGKILL, ubq->ubq_daemon, 0);
+			ubq->timeout = true;
+		}
+
+		return BLK_EH_DONE;
+	}
+
+	return BLK_EH_RESET_TIMER;
+}
+
 static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
 		const struct blk_mq_queue_data *bd)
 {
@@ -953,6 +970,7 @@ static const struct blk_mq_ops ublk_mq_ops = {
 	.queue_rq       = ublk_queue_rq,
 	.init_hctx	= ublk_init_hctx,
 	.init_request   = ublk_init_rq,
+	.timeout	= ublk_timeout,
 };
 
 static int ublk_ch_open(struct inode *inode, struct file *filp)
@@ -1713,6 +1731,18 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
 	else if (!(info.flags & UBLK_F_UNPRIVILEGED_DEV))
 		return -EPERM;
 
+	/*
+	 * unprivileged device can't be trusted, but RECOVERY and
+	 * RECOVERY_REISSUE still may hang error handling, so can't
+	 * support recovery features for unprivileged ublk now
+	 *
+	 * TODO: provide forward progress for RECOVERY handler, so that
+	 * unprivileged device can benefit from it
+	 */
+	if (info.flags & UBLK_F_UNPRIVILEGED_DEV)
+		info.flags &= ~(UBLK_F_USER_RECOVERY_REISSUE |
+				UBLK_F_USER_RECOVERY);
+
 	/* the created device is always owned by current user */
 	ublk_store_owner_uid_gid(&info.owner_uid, &info.owner_gid);
 
@@ -1981,6 +2011,7 @@ static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
 	put_task_struct(ubq->ubq_daemon);
 	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
 	ubq->ubq_daemon = NULL;
+	ubq->timeout = false;
 
 	for (i = 0; i < ubq->q_depth; i++) {
 		struct ublk_io *io = &ubq->ios[i];
-- cgit v1.2.3
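For context, a hedged sketch of the blk-mq contract that ublk_timeout() plugs into: the block layer invokes the .timeout callback once a request has been outstanding longer than the queue's request timeout, and the return value decides whether the timer is re-armed (BLK_EH_RESET_TIMER) or the driver is considered to have taken over the request (BLK_EH_DONE). Everything below other than the blk-mq types is illustrative; the structure loosely mirrors the handler added above.

#include <linux/blk-mq.h>

static enum blk_eh_timer_return demo_timeout(struct request *rq)
{
	/* a real driver would look up its per-queue context from rq here */
	bool untrusted_backend = true;	/* placeholder for the UBLK_F_UNPRIVILEGED_DEV check */

	if (untrusted_backend) {
		/* stop waiting on the backend; completion happens via error handling */
		return BLK_EH_DONE;
	}

	/* trusted backend: keep waiting for it to complete the request */
	return BLK_EH_RESET_TIMER;
}

static const struct blk_mq_ops demo_mq_ops = {
	/* .queue_rq, .init_hctx, ... omitted in this sketch */
	.timeout	= demo_timeout,
};

Returning BLK_EH_DONE for untrusted backends is what bounds how long a stuck unprivileged ublk daemon can block callers such as sync(), while trusted devices keep the old keep-waiting behaviour.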