From c4e47bbb00dad9240f4c054859950e962042ecb8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 14:14:59 -0700 Subject: block: move cgroup time handling code into blk.h In preparation for moving time keeping into blk.h, move the cgroup related code for timestamps in here too. This will help avoid a circular dependency, and also moves it into a more appropriate header as this one is private to the block layer code. Leave struct bio_issue in blk_types.h as it's a proper time definition. Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 42 ------------------------------------------ 1 file changed, 42 deletions(-) (limited to 'include') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index f288c94374b3..1c07848dea7e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -206,52 +206,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; -- cgit v1.2.3 From da4c8c3d0975f031ef82d39927102e39fa6ddfac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Jan 2024 14:46:03 -0700 Subject: block: cache current nsec time in struct blk_plug Querying the current time is the most costly thing we do in the block layer per IO, and depending on kernel config settings, we may do it many times per IO. None of the callers actually need nsec granularity. Take advantage of that by caching the current time in the plug, with the assumption here being that any time checking will be temporally close enough that the slight loss of precision doesn't matter. If the block plug gets flushed, eg on preempt or schedule out, then we invalidate the cached clock. On a basic peak IOPS test case with iostats enabled, this changes the performance from: IOPS=108.41M, BW=52.93GiB/s, IOS/call=31/31 IOPS=108.43M, BW=52.94GiB/s, IOS/call=32/32 IOPS=108.29M, BW=52.88GiB/s, IOS/call=31/32 IOPS=108.35M, BW=52.91GiB/s, IOS/call=32/32 IOPS=108.42M, BW=52.94GiB/s, IOS/call=31/31 IOPS=108.40M, BW=52.93GiB/s, IOS/call=32/32 IOPS=108.31M, BW=52.89GiB/s, IOS/call=32/31 to IOPS=118.79M, BW=58.00GiB/s, IOS/call=31/32 IOPS=118.62M, BW=57.92GiB/s, IOS/call=31/31 IOPS=118.80M, BW=58.01GiB/s, IOS/call=32/31 IOPS=118.78M, BW=58.00GiB/s, IOS/call=32/32 IOPS=118.69M, BW=57.95GiB/s, IOS/call=32/31 IOPS=118.62M, BW=57.92GiB/s, IOS/call=32/31 IOPS=118.63M, BW=57.92GiB/s, IOS/call=31/32 which is more than a 9% improvement in performance. Looking at perf diff, we can see a huge reduction in time overhead: 10.55% -9.88% [kernel.vmlinux] [k] read_tsc 1.31% -1.22% [kernel.vmlinux] [k] ktime_get Note that since this relies on blk_plug for the caching, it's only applicable to the issue side. But this is where most of the time calls happen anyway. On the completion side, cached time stamping is done with struct io_comp patch, as long as the driver supports it. It's also worth noting that the above testing doesn't enable any of the higher cost CPU items on the block layer side, like wbt, cgroups, iocost, etc, which all would add additional time querying and hence overhead. IOW, results would likely look even better in comparison with those enabled, as distros would do. Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk.h | 14 +++++++++++++- include/linux/blkdev.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index de771093b526..13b449df5ba0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1083,6 +1083,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; + plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); diff --git a/block/blk.h b/block/blk.h index 79ae533cdf02..14bbc4b780f2 100644 --- a/block/blk.h +++ b/block/blk.h @@ -519,7 +519,19 @@ static inline int req_ref_read(struct request *req) static inline u64 blk_time_get_ns(void) { - return ktime_get_ns(); + struct blk_plug *plug = current->plug; + + if (!plug) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!plug->cur_ktime) + plug->cur_ktime = ktime_get_ns(); + return plug->cur_ktime; } static inline ktime_t blk_time_get(void) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 99e4f5e72213..996d2ad756ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -942,6 +942,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; -- cgit v1.2.3 From 06b23f92af87a84d70881b2ecaa72e00f7838264 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Jan 2024 09:18:39 -0700 Subject: block: update cached timestamp post schedule/preemption Mark the task as having a cached timestamp when set assign it, so we can efficiently check if it needs updating post being scheduled back in. This covers both the actual schedule out case, which would've flushed the plug, and the preemption case which doesn't touch the plugged requests (for many reasons, one of them being then we'd need to have preemption disabled around plug state manipulation). Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- block/blk-core.c | 2 ++ block/blk.h | 4 +++- include/linux/blkdev.h | 16 ++++++++++++++++ include/linux/sched.h | 2 +- kernel/sched/core.c | 6 ++++-- 5 files changed, 26 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 13b449df5ba0..314c3065891a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1183,6 +1183,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + current->flags &= ~PF_BLOCK_TS; } /** diff --git a/block/blk.h b/block/blk.h index 14bbc4b780f2..913c93838a01 100644 --- a/block/blk.h +++ b/block/blk.h @@ -529,8 +529,10 @@ static inline u64 blk_time_get_ns(void) * a valid timestamp" separately, just accept that we'll do an extra * ktime_get_ns() if we just happen to get 0 as the current time. */ - if (!plug->cur_ktime) + if (!plug->cur_ktime) { plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } return plug->cur_ktime; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 996d2ad756ff..d7cac3de65b3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -973,6 +973,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1008,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..15b7cb478d16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..083f2258182d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6787,10 +6787,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } -- cgit v1.2.3 From 71f4ecdbb42addf82b01b734b122a02707fed521 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Sun, 28 Jan 2024 23:52:20 -0800 Subject: block: remove gfp_flags from blkdev_zone_mgmt Now that all callers pass in GFP_KERNEL to blkdev_zone_mgmt() and use memalloc_no{io,fs}_{save,restore}() to define the allocation scope, we can drop the gfp_mask parameter from blkdev_zone_mgmt() as well as blkdev_zone_reset_all() and blkdev_zone_reset_all_emulated(). Signed-off-by: Johannes Thumshirn Reviewed-by: Damien Le Moal Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20240128-zonefs_nofs-v3-5-ae3b7c8def61@wdc.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 19 ++++++++----------- drivers/md/dm-zoned-metadata.c | 2 +- drivers/nvme/target/zns.c | 5 ++--- fs/btrfs/zoned.c | 14 +++++--------- fs/f2fs/segment.c | 4 ++-- fs/zonefs/super.c | 2 +- include/linux/blkdev.h | 2 +- 7 files changed, 20 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d343e5756a9c..d4f4f8325eff 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -177,8 +177,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, } } -static int blkdev_zone_reset_all_emulated(struct block_device *bdev, - gfp_t gfp_mask) +static int blkdev_zone_reset_all_emulated(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); @@ -205,7 +204,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, } bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - gfp_mask); + GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -223,7 +222,7 @@ out_free_need_reset: return ret; } -static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -238,7 +237,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Perform the specified operation on the range of zones specified by @@ -248,7 +246,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * or finish request. */ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) + sector_t sector, sector_t nr_sectors) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); @@ -285,12 +283,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev, gfp_mask); - return blkdev_zone_reset_all(bdev, gfp_mask); + return blkdev_zone_reset_all_emulated(bdev); + return blkdev_zone_reset_all(bdev); } while (sector < end_sector) { - bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -419,8 +417,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: if (cmd == BLKRESETZONE) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 165996cc966c..8156881a31de 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1660,7 +1660,7 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) noio_flag = memalloc_noio_save(); ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), - zmd->zone_nr_sectors, GFP_KERNEL); + zmd->zone_nr_sectors); memalloc_noio_restore(noio_flag); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 5b5c1e481722..3148d9f1bde6 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -456,8 +456,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) switch (zsa_req_op(req->cmd->zms.zsa)) { case REQ_OP_ZONE_RESET: ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, - get_capacity(req->ns->bdev->bd_disk), - GFP_KERNEL); + get_capacity(req->ns->bdev->bd_disk)); if (ret < 0) return blkdev_zone_mgmt_errno_to_nvme_status(ret); break; @@ -508,7 +507,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) goto out; } - ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors); if (ret < 0) status = blkdev_zone_mgmt_errno_to_nvme_status(ret); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 05640d61e435..cf2e779d8ef4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -830,8 +830,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_KERNEL); + reset->start, reset->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -984,7 +983,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_KERNEL); + zone->len); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -1023,8 +1022,7 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, - GFP_KERNEL); + zone_sectors * BTRFS_NR_SB_LOG_ZONES); memalloc_nofs_restore(nofs_flags); return ret; } @@ -1143,8 +1141,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, *bytes = 0; nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_KERNEL); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -2258,8 +2255,7 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_KERNEL); + zinfo->zone_size >> SECTOR_SHIFT); memalloc_nofs_restore(nofs_flags); if (ret) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0094fe491364..e1065ba70207 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1977,7 +1977,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_KERNEL); + sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } @@ -4921,7 +4921,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, - zone->start, zone->len, GFP_KERNEL); + zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 63fbac018c04..cadb1364f951 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb, trace_zonefs_zone_mgmt(sb, z, op); ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, - z->z_size >> SECTOR_SHIFT, GFP_KERNEL); + z->z_size >> SECTOR_SHIFT); if (ret) { zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d7cac3de65b3..fac580976e3a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -325,7 +325,7 @@ void disk_set_zoned(struct gendisk *disk); int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); + sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); -- cgit v1.2.3 From 60d21aac52e26531affdadb7543fe5b93f58b450 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 1 Feb 2024 18:31:25 +0530 Subject: block: support PI at non-zero offset within metadata Block layer integrity processing assumes that protection information (PI) is placed in the first bytes of each metadata block. Remove this limitation and include the metadata before the PI in the calculation of the guard tag. Signed-off-by: Kanchan Joshi Signed-off-by: Chinmay Gameti Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20240201130126.211402-3-joshi.k@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 1 + block/blk-integrity.c | 1 + block/t10-pi.c | 52 +++++++++++++++++++++++++++++++------------ include/linux/blk-integrity.h | 1 + include/linux/blkdev.h | 1 + 5 files changed, 42 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c9a16fba58b9..2e3e8e04961e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); + iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d4e9b4556d14..ccbeb6dfa87a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->profile = template->profile ? template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; + bi->pi_offset = template->pi_offset; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); diff --git a/block/t10-pi.c b/block/t10-pi.c index 251a7b188963..d90892fd6f2a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; pi->guard_tag = fn(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, + offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; BUG_ON(type == T10_PI_TYPE0_PROTECTION); for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; if (type == T10_PI_TYPE1_PROTECTION || @@ -84,6 +89,8 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, } csum = fn(0, iter->data_buf, iter->interval); + if (offset) + csum = fn(csum, iter->prot_buf, offset); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) */ static void t10_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == virt) pi->ref_tag = cpu_to_be32(ref_tag); @@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == ref_tag) pi->ref_tag = cpu_to_be32(virt); @@ -288,12 +299,16 @@ static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), + iter->prot_buf, offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag) static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; u64 ref, seed; __be64 csum; @@ -344,6 +360,10 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, } csum = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, + offset); + if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ "(rcvd %016llx, want %016llx)\n", @@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) static void ext_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == virt) @@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == ref_tag) diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 378b2459efe2..e253e7bd0d17 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -20,6 +20,7 @@ struct blk_integrity_iter { unsigned int data_size; unsigned short interval; unsigned char tuple_size; + unsigned char pi_offset; const char *disk_name; }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index fac580976e3a..0058783a4c43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; unsigned char tuple_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; -- cgit v1.2.3 From 8c4955c069ea3b77dc63b55d13afa9341e894849 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:11 +0100 Subject: block: move max_{open,active}_zones to struct queue_limits The maximum number of open and active zones is a limit on the queue and should be places there so that we can including it in the upcoming queue limits batch update API. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-2-hch@lst.de Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0058783a4c43..251a11d2d2ae 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -190,8 +190,6 @@ struct gendisk { * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; - unsigned int max_open_zones; - unsigned int max_active_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ @@ -308,6 +306,8 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; bool zoned; + unsigned int max_open_zones; + unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -640,23 +640,23 @@ static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { - disk->max_open_zones = max_open_zones; + disk->queue->limits.max_open_zones = max_open_zones; } static inline void disk_set_max_active_zones(struct gendisk *disk, unsigned int max_active_zones) { - disk->max_active_zones = max_active_zones; + disk->queue->limits.max_active_zones = max_active_zones; } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { - return bdev->bd_disk->max_open_zones; + return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { - return bdev->bd_disk->max_active_zones; + return bdev->bd_disk->queue->limits.max_active_zones; } #else /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From d690cb8ae14bd377d422b7905b6959c7e7a45b95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:14 +0100 Subject: block: add an API to atomically update queue limits Add a new queue_limits_{start,commit}_update pair of functions that allows taking an atomic snapshot of queue limits, update it, and commit it if it passes validity checking. Also use the low-level validation helper to implement blk_set_default_limits instead of duplicating the initialization. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + block/blk-settings.c | 228 +++++++++++++++++++++++++++++++++++++++++-------- block/blk.h | 2 +- include/linux/blkdev.h | 23 +++++ 4 files changed, 217 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/block/blk-core.c b/block/blk-core.c index 2b11d8325fde..cb56724a8dfb 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -425,6 +425,7 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); diff --git a/block/blk-settings.c b/block/blk-settings.c index 24042f6b33d5..786b369ca599 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -25,42 +25,6 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); -/** - * blk_set_default_limits - reset limits to default values - * @lim: the queue_limits structure to reset - * - * Description: - * Returns a queue_limit struct to its default state. - */ -void blk_set_default_limits(struct queue_limits *lim) -{ - lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_segments = 1; - lim->max_integrity_segments = 0; - lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->virt_boundary_mask = 0; - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_user_sectors = lim->max_dev_sectors = 0; - lim->chunk_sectors = 0; - lim->max_write_zeroes_sectors = 0; - lim->max_zone_append_sectors = 0; - lim->max_discard_sectors = 0; - lim->max_hw_discard_sectors = 0; - lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 512; - lim->discard_alignment = 0; - lim->discard_misaligned = 0; - lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce = BLK_BOUNCE_NONE; - lim->alignment_offset = 0; - lim->io_opt = 0; - lim->misaligned = 0; - lim->zoned = false; - lim->zone_write_granularity = 0; - lim->dma_alignment = 511; -} - /** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset @@ -101,6 +65,198 @@ static void blk_apply_bdi_limits(struct backing_dev_info *bdi, bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } +static int blk_validate_zoned_limits(struct queue_limits *lim) +{ + if (!lim->zoned) { + if (WARN_ON_ONCE(lim->max_open_zones) || + WARN_ON_ONCE(lim->max_active_zones) || + WARN_ON_ONCE(lim->zone_write_granularity) || + WARN_ON_ONCE(lim->max_zone_append_sectors)) + return -EINVAL; + return 0; + } + + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) + return -EINVAL; + + if (lim->zone_write_granularity < lim->logical_block_size) + lim->zone_write_granularity = lim->logical_block_size; + + if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size + * and the zone size given that it can't span zones. + */ + lim->max_zone_append_sectors = + min3(lim->max_hw_sectors, + lim->max_zone_append_sectors, + lim->chunk_sectors); + } + + return 0; +} + +/* + * Check that the limits in lim are valid, initialize defaults for unset + * values, and cap values based on others where needed. + */ +static int blk_validate_limits(struct queue_limits *lim) +{ + unsigned int max_hw_sectors; + + /* + * Unless otherwise specified, default to 512 byte logical blocks and a + * physical block size equal to the logical block size. + */ + if (!lim->logical_block_size) + lim->logical_block_size = SECTOR_SIZE; + if (lim->physical_block_size < lim->logical_block_size) + lim->physical_block_size = lim->logical_block_size; + + /* + * The minimum I/O size defaults to the physical block size unless + * explicitly overridden. + */ + if (lim->io_min < lim->physical_block_size) + lim->io_min = lim->physical_block_size; + + /* + * max_hw_sectors has a somewhat weird default for historical reason, + * but driver really should set their own instead of relying on this + * value. + * + * The block layer relies on the fact that every driver can + * handle at lest a page worth of data per I/O, and needs the value + * aligned to the logical block size. + */ + if (!lim->max_hw_sectors) + lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) + return -EINVAL; + lim->max_hw_sectors = round_down(lim->max_hw_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * The actual max_sectors value is a complex beast and also takes the + * max_dev_sectors value (set by SCSI ULPs) and a user configurable + * value into account. The ->max_sectors value is always calculated + * from these, so directly setting it won't have any effect. + */ + max_hw_sectors = min_not_zero(lim->max_hw_sectors, + lim->max_dev_sectors); + if (lim->max_user_sectors) { + if (lim->max_user_sectors > max_hw_sectors || + lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + return -EINVAL; + lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else { + lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); + } + lim->max_sectors = round_down(lim->max_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * Random default for the maximum number of segments. Driver should not + * rely on this and set their own. + */ + if (!lim->max_segments) + lim->max_segments = BLK_MAX_SEGMENTS; + + lim->max_discard_sectors = lim->max_hw_discard_sectors; + if (!lim->max_discard_segments) + lim->max_discard_segments = 1; + + if (lim->discard_granularity < lim->physical_block_size) + lim->discard_granularity = lim->physical_block_size; + + /* + * By default there is no limit on the segment boundary alignment, + * but if there is one it can't be smaller than the page size as + * that would break all the normal I/O patterns. + */ + if (!lim->seg_boundary_mask) + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + return -EINVAL; + + /* + * The maximum segment size has an odd historic 64k default that + * drivers probably should override. Just like the I/O size we + * require drivers to at least handle a full page per segment. + */ + if (!lim->max_segment_size) + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + return -EINVAL; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be identical to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */ + if (lim->virt_boundary_mask) { + if (WARN_ON_ONCE(lim->max_segment_size && + lim->max_segment_size != UINT_MAX)) + return -EINVAL; + lim->max_segment_size = UINT_MAX; + } + + /* + * We require drivers to at least do logical block aligned I/O, but + * historically could not check for that due to the separate calls + * to set the limits. Once the transition is finished the check + * below should be narrowed down to check the logical block size. + */ + if (!lim->dma_alignment) + lim->dma_alignment = SECTOR_SIZE - 1; + if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) + return -EINVAL; + + if (lim->alignment_offset) { + lim->alignment_offset &= (lim->physical_block_size - 1); + lim->misaligned = 0; + } + + return blk_validate_zoned_limits(lim); +} + +/* + * Set the default limits for a newly allocated queue. @lim contains the + * initial limits set by the driver, which could be no limit in which case + * all fields are cleared to zero. + */ +int blk_set_default_limits(struct queue_limits *lim) +{ + return blk_validate_limits(lim); +} + +/** + * queue_limits_commit_update - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated by the caller to @q. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim) + __releases(q->limits_lock) +{ + int error = blk_validate_limits(lim); + + if (!error) { + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); + } + mutex_unlock(&q->limits_lock); + return error; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/block/blk.h b/block/blk.h index 913c93838a01..43c7e9180b30 100644 --- a/block/blk.h +++ b/block/blk.h @@ -330,7 +330,7 @@ void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_set_default_limits(struct queue_limits *lim); +int blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 251a11d2d2ae..d41d7fe93457 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -474,6 +474,7 @@ struct request_queue { struct mutex sysfs_lock; struct mutex sysfs_dir_lock; + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -862,6 +863,28 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, return chunk_sectors - (offset & (chunk_sectors - 1)); } +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. The caller must have frozen the queue or ensured + * that there is outstanding I/O by other means. + */ +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) + __acquires(q->limits_lock) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); + /* * Access functions for manipulating queue properties */ -- cgit v1.2.3 From 4f563a64732dabb2677c7d1232a8f714a18b41b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:16 +0100 Subject: block: add a max_user_discard_sectors queue limit Add a new max_user_discard_sectors limit that mirrors max_user_sectors and stores the value that the user manually set. This now allows updates of the max_hw_discard_sectors to not worry about the user limit. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-7-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 18 +++++++++++++++--- block/blk-sysfs.c | 17 ++++++++--------- include/linux/blkdev.h | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 786b369ca599..c4406aacc0ef 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -51,6 +51,7 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; + lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); @@ -162,7 +163,9 @@ static int blk_validate_limits(struct queue_limits *lim) if (!lim->max_segments) lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_sectors = lim->max_hw_discard_sectors; + lim->max_discard_sectors = + min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + if (!lim->max_discard_segments) lim->max_discard_segments = 1; @@ -228,6 +231,12 @@ static int blk_validate_limits(struct queue_limits *lim) */ int blk_set_default_limits(struct queue_limits *lim) { + /* + * Most defaults are set by capping the bounds in blk_validate_limits, + * but max_user_discard_sectors is special and needs an explicit + * initialization to the max value here. + */ + lim->max_user_discard_sectors = UINT_MAX; return blk_validate_limits(lim); } @@ -349,8 +358,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { - q->limits.max_hw_discard_sectors = max_discard_sectors; - q->limits.max_discard_sectors = max_discard_sectors; + struct queue_limits *lim = &q->limits; + + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_discard_sectors = + min(max_discard_sectors, lim->max_user_discard_sectors); } EXPORT_SYMBOL(blk_queue_max_discard_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 26607f9825cb..a1ec27f0ba41 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -174,23 +174,22 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) static ssize_t queue_discard_max_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_discard; - ssize_t ret = queue_var_store(&max_discard, page, count); + unsigned long max_discard_bytes; + ssize_t ret; + ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; - if (max_discard & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (q->limits.discard_granularity - 1)) return -EINVAL; - max_discard >>= 9; - if (max_discard > UINT_MAX) + if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - if (max_discard > q->limits.max_hw_discard_sectors) - max_discard = q->limits.max_hw_discard_sectors; - - q->limits.max_discard_sectors = max_discard; + q->limits.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + q->limits.max_discard_sectors = min(q->limits.max_hw_discard_sectors, + q->limits.max_user_discard_sectors); return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d41d7fe93457..45746ba73670 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -291,6 +291,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; -- cgit v1.2.3 From 9ac4dd8c47d533eb420af6a679e66ec74771125c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:19 +0100 Subject: block: pass a queue_limits argument to blk_mq_init_queue Pass a queue_limits to blk_mq_init_queue and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also rename the function to blk_mq_alloc_queue as that is a much better name for a function that allocates a queue and always pass the queuedata argument instead of having a separate version for the extra argument. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-10-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 21 ++++++++------------- block/bsg-lib.c | 2 +- drivers/nvme/host/apple.c | 2 +- drivers/nvme/host/core.c | 6 +++--- drivers/scsi/scsi_scan.c | 2 +- drivers/ufs/core/ufshcd.c | 2 +- include/linux/blk-mq.h | 3 ++- 7 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/block/blk-mq.c b/block/blk-mq.c index 9dd8055cc524..f6499bbd89be 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4083,14 +4083,14 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, - void *queuedata) +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata) { - struct queue_limits lim = { }; + struct queue_limits default_lim = { }; struct request_queue *q; int ret; - q = blk_alloc_queue(&lim, set->numa_node); + q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); if (IS_ERR(q)) return q; q->queuedata = queuedata; @@ -4101,20 +4101,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, } return q; } - -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) -{ - return blk_mq_init_queue_data(set, NULL); -} -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping - * the reference from blk_mq_init_queue() by calling blk_put_queue(). + * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ @@ -4141,7 +4136,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct request_queue *q; struct gendisk *disk; - q = blk_mq_init_queue_data(set, queuedata); + q = blk_mq_alloc_queue(set, NULL, queuedata); if (IS_ERR(q)) return ERR_CAST(q); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index b3acdbdb6e7e..bcc7dee6abce 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, if (blk_mq_alloc_tag_set(set)) goto out_tag_set; - q = blk_mq_init_queue(set); + q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out_queue; diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index c727cd1f264b..a480cdeac288 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset); + anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL); if (IS_ERR(anv->ctrl.admin_q)) { ret = -ENOMEM; goto put_dev; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6e7f9b13fba2..5bcdf3654598 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4372,14 +4372,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ret) return ret; - ctrl->admin_q = blk_mq_init_queue(set); + ctrl->admin_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->admin_q)) { ret = PTR_ERR(ctrl->admin_q); goto out_free_tagset; } if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->fabrics_q = blk_mq_init_queue(set); + ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->fabrics_q)) { ret = PTR_ERR(ctrl->fabrics_q); goto out_cleanup_admin_q; @@ -4443,7 +4443,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return ret; if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->connect_q = blk_mq_init_queue(set); + ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->connect_q)) { ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 44680f65ea14..9969f4e2f1c3 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -332,7 +332,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, sdev->sg_reserved_size = INT_MAX; - q = blk_mq_init_queue(&sdev->host->tag_set); + q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL); if (IS_ERR(q)) { /* release fn is set up in scsi_sysfs_device_initialise, so * have to free and put manually here */ diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 029d017fc1b6..c502a86db16b 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10592,7 +10592,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) goto out_remove_scsi_host; - hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set); + hba->tmf_queue = blk_mq_alloc_queue(&hba->tmf_tag_set, NULL, NULL); if (IS_ERR(hba->tmf_queue)) { err = PTR_ERR(hba->tmf_queue); goto free_tmf_tag_set; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..7d42c359e2ab 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -692,7 +692,8 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); -- cgit v1.2.3 From 27e32cd23fed1ab88098897897dcb9ec2bdba4de Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2024 08:34:20 +0100 Subject: block: pass a queue_limits argument to blk_mq_alloc_disk Pass a queue_limits to blk_mq_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: John Garry Reviewed-by: Chaitanya Kulkarni Reviewed-by: Ming Lei Reviewed-by: Damien Le Moal Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20240213073425.1621680-11-hch@lst.de Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 2 +- block/blk-mq.c | 5 +++-- drivers/block/amiflop.c | 2 +- drivers/block/aoe/aoeblk.c | 2 +- drivers/block/ataflop.c | 2 +- drivers/block/floppy.c | 2 +- drivers/block/loop.c | 2 +- drivers/block/mtip32xx/mtip32xx.c | 2 +- drivers/block/nbd.c | 2 +- drivers/block/null_blk/main.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/block/sunvdc.c | 2 +- drivers/block/swim.c | 2 +- drivers/block/swim3.c | 2 +- drivers/block/ublk_drv.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/block/z2ram.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/memstick/core/ms_block.c | 2 +- drivers/memstick/core/mspro_block.c | 2 +- drivers/mmc/core/queue.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/mtd/ubi/block.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/s390/block/dasd_genhd.c | 2 +- drivers/s390/block/scm_blk.c | 2 +- include/linux/blk-mq.h | 7 ++++--- 30 files changed, 35 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 92ee2697ff39..25f1b18ce7d4 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -906,7 +906,7 @@ static int ubd_add(int n, char **error_out) if (err) goto out; - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, NULL, ubd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; diff --git a/block/blk-mq.c b/block/blk-mq.c index f6499bbd89be..6abb4ce46baa 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4130,13 +4130,14 @@ void blk_mq_destroy_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_destroy_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; - q = blk_mq_alloc_queue(set, NULL, queuedata); + q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 2b98114a9fe0..a25414228e47 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b1b47d88f5db..2ff6e2da8cc4 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -371,7 +371,7 @@ aoeblk_gdalloc(void *vp) goto err_mempool; } - gd = blk_mq_alloc_disk(set, d); + gd = blk_mq_alloc_disk(set, NULL, d); if (IS_ERR(gd)) { pr_err("aoe: cannot allocate block queue for %ld.%d\n", d->aoemajor, d->aoeminor); diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 50949207798d..cacc4ba942a8 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 2ba0ba135951..582cf50c6bf6 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -4518,7 +4518,7 @@ static int floppy_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); + disk = blk_mq_alloc_disk(&tag_sets[drive], NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8145499da38..3f855cc79c29 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2025,7 +2025,7 @@ static int loop_add(int i) if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, NULL, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b200950e8fb5..ac08dea73552 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3431,7 +3431,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto block_queue_alloc_tag_error; } - dd->disk = blk_mq_alloc_disk(&dd->tags, dd); + dd->disk = blk_mq_alloc_disk(&dd->tags, NULL, dd); if (IS_ERR(dd->disk)) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 33a8f37bb6a1..30ae3cc12e77 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1823,7 +1823,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err < 0) goto out_free_tags; - disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); + disk = blk_mq_alloc_disk(&nbd->tag_set, NULL, NULL); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_idr; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4281371c81fe..eeb895ec6f34 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2147,7 +2147,7 @@ static int null_add_dev(struct nullb_device *dev) goto out_cleanup_queues; nullb->tag_set->timeout = 5 * HZ; - nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, NULL, nullb); if (IS_ERR(nullb->disk)) { rv = PTR_ERR(nullb->disk); goto out_cleanup_tags; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 36d7b36c60c7..dfd3860df4f8 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -431,7 +431,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) if (error) goto fail_teardown; - gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); + gendisk = blk_mq_alloc_disk(&priv->tag_set, NULL, dev); if (IS_ERR(gendisk)) { dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", __func__, __LINE__); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 00ca8a1d8c46..6b4f1898a722 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4966,7 +4966,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (err) return err; - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, NULL, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4044c369d22a..d51be4f2df61 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1408,7 +1408,7 @@ static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, dev->size = le64_to_cpu(rsp->nsectors) * le16_to_cpu(rsp->logical_block_size); - dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, NULL, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); dev->queue = dev->gd->queue; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7bf4b48e2282..a1f74dd1eae5 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -824,7 +824,7 @@ static int probe_disk(struct vdc_port *port) if (err) return err; - g = blk_mq_alloc_disk(&port->tag_set, port); + g = blk_mq_alloc_disk(&port->tag_set, NULL, port); if (IS_ERR(g)) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index f85b6af414b4..16bdf62067d8 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c2bc85826358..a04756ac778e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev, if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1dfb2e77898b..c5b655270798 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2222,7 +2222,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, NULL); + disk = blk_mq_alloc_disk(&ub->tag_set, NULL, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5bf98fd6a651..a23fce4eca44 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1330,7 +1330,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (err) goto out_free_vq; - vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, NULL, vblk); if (IS_ERR(vblk->disk)) { err = PTR_ERR(vblk->disk); goto out_free_tags; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 434fab306777..4cc2884e7484 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -1136,7 +1136,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (err) goto out_release_minors; - gd = blk_mq_alloc_disk(&info->tag_set, info); + gd = blk_mq_alloc_disk(&info->tag_set, NULL, info); if (IS_ERR(gd)) { err = PTR_ERR(gd); goto out_free_tag_set; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 11493167b0a8..7c5f4e4d9b50 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&tag_set, NULL); + disk = blk_mq_alloc_disk(&tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d668b174ace9..1d044779f5e4 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -778,7 +778,7 @@ static int probe_gdrom(struct platform_device *devptr) if (err) goto probe_fail_free_cd_info; - gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); + gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL, NULL); if (IS_ERR(gd.disk)) { err = PTR_ERR(gd.disk); goto probe_fail_free_tag_set; diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 04115cd92433..d3277c901d16 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2093,7 +2093,7 @@ static int msb_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, NULL, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 5a69ed33999b..db0e2a42ca3c 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1138,7 +1138,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, NULL, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index a0a2412f62a7..67ad186d132a 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -447,7 +447,7 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) return ERR_PTR(ret); - disk = blk_mq_alloc_disk(&mq->tag_set, mq); + disk = blk_mq_alloc_disk(&mq->tag_set, NULL, mq); if (IS_ERR(disk)) { blk_mq_free_tag_set(&mq->tag_set); return disk; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f0526dcc2162..b8878a2457af 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -333,7 +333,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) goto out_kfree_tag_set; /* Create gendisk */ - gd = blk_mq_alloc_disk(new->tag_set, new); + gd = blk_mq_alloc_disk(new->tag_set, NULL, new); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tag_set; diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 654bd7372cd8..9be87c231a2e 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -393,7 +393,7 @@ int ubiblock_create(struct ubi_volume_info *vi) /* Initialize the gendisk of this ubiblock device */ - gd = blk_mq_alloc_disk(&dev->tag_set, dev); + gd = blk_mq_alloc_disk(&dev->tag_set, NULL, dev); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tags; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 5bcdf3654598..eed3e22e24d9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3694,7 +3694,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (!ns) return; - disk = blk_mq_alloc_disk(ctrl->tagset, ns); + disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); if (IS_ERR(disk)) goto out_free_ns; disk->fops = &nvme_bdev_ops; diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 30e8ee583e98..0465b706745f 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -53,7 +53,7 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (rc) return rc; - gdp = blk_mq_alloc_disk(&block->tag_set, block); + gdp = blk_mq_alloc_disk(&block->tag_set, NULL, block); if (IS_ERR(gdp)) { blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index ade95e91b3c8..d05b2e2799a4 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -462,7 +462,7 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) if (ret) goto out; - bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev); + bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, NULL, scmdev); if (IS_ERR(bdev->gendisk)) { ret = PTR_ERR(bdev->gendisk); goto out_tag; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7d42c359e2ab..390d35fa0032 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -682,13 +682,14 @@ enum { #define BLK_MQ_NO_HCTX_IDX (-1U) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -#define blk_mq_alloc_disk(set, queuedata) \ +#define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ - __blk_mq_alloc_disk(set, queuedata, &__key); \ + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -- cgit v1.2.3 From 74fa8f9c553f7b5ccab7d103acae63cc2e080465 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 15 Feb 2024 08:10:47 +0100 Subject: block: pass a queue_limits argument to blk_alloc_disk Pass a queue_limits to blk_alloc_disk and apply it if non-NULL. This will allow allocating queues with valid queue limits instead of setting the values one at a time later. Also change blk_alloc_disk to return an ERR_PTR instead of just NULL which can't distinguish errors. Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Himanshu Madhani Link: https://lore.kernel.org/r/20240215071055.2201424-2-hch@lst.de Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 6 ++++-- arch/xtensa/platforms/iss/simdisk.c | 8 +++++--- block/genhd.c | 11 ++++++----- drivers/block/brd.c | 7 ++++--- drivers/block/drbd/drbd_main.c | 6 ++++-- drivers/block/n64cart.c | 6 ++++-- drivers/block/null_blk/main.c | 7 ++++--- drivers/block/pktcdvd.c | 7 ++++--- drivers/block/ps3vram.c | 6 +++--- drivers/block/zram/zram_drv.c | 6 +++--- drivers/md/bcache/super.c | 4 ++-- drivers/md/dm.c | 4 ++-- drivers/md/md.c | 7 ++++--- drivers/nvdimm/btt.c | 8 ++++---- drivers/nvdimm/pmem.c | 6 +++--- drivers/nvme/host/multipath.c | 6 +++--- drivers/s390/block/dcssblk.c | 6 +++--- include/linux/blkdev.h | 10 +++++++--- 18 files changed, 69 insertions(+), 52 deletions(-) (limited to 'include') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index a708fbd5a844..539ff56b6968 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -117,9 +117,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->disk) + dev->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev->disk)) { + err = PTR_ERR(dev->disk); goto free_dev; + } dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 178cf96ca10a..defc67909a9c 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { char tmp[2] = { '0' + which, 0 }; - int err = -ENOMEM; + int err; dev->fd = -1; dev->filename = NULL; spin_lock_init(&dev->lock); dev->users = 0; - dev->gd = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->gd) + dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev->gd)) { + err = PTR_ERR(dev->gd); goto out; + } dev->gd->major = simdisk_major; dev->gd->first_minor = which; dev->gd->minors = SIMDISK_MINORS; diff --git a/block/genhd.c b/block/genhd.c index 7a8fd57c51f7..84c822d989da 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1391,20 +1391,21 @@ out_free_disk: return NULL; } -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass) { - struct queue_limits lim = { }; + struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(&lim, node); + q = blk_alloc_queue(lim ? lim : &default_lim, node); if (IS_ERR(q)) - return NULL; + return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); - return NULL; + return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 970bd6ff38c4..689a3c0c31f8 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -335,10 +335,11 @@ static int brd_alloc(int i) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = brd->brd_disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_free_dev; - + } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6bc86106c7b2..cea1e537fd56 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2708,9 +2708,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_no_disk; + } device->vdisk = disk; device->rq_queue = disk->queue; diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index d914156db2d8..c64d7ee7a44d 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -131,9 +131,11 @@ static int __init n64cart_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out; + } disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART; diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index eeb895ec6f34..baf2b228d008 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -2154,10 +2154,11 @@ static int null_add_dev(struct nullb_device *dev) } nullb->q = nullb->disk->queue; } else if (dev->queue_mode == NULL_Q_BIO) { - rv = -ENOMEM; - nullb->disk = blk_alloc_disk(nullb->dev->home_node); - if (!nullb->disk) + nullb->disk = blk_alloc_disk(NULL, nullb->dev->home_node); + if (IS_ERR(nullb->disk)) { + rv = PTR_ERR(nullb->disk); goto out_cleanup_queues; + } nullb->q = nullb->disk->queue; rv = init_driver_queues(nullb); diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d56d972aadb3..abb82926b1c9 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2673,10 +2673,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_on = write_congestion_on; pd->write_congestion_off = write_congestion_off; - ret = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); goto out_mem; + } pd->disk = disk; disk->major = pktdev_major; disk->first_minor = idx; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 38d42af01b25..bdcf083b45e2 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - gendisk = blk_alloc_disk(NUMA_NO_NODE); - if (!gendisk) { + gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(gendisk)) { dev_err(&dev->core, "blk_alloc_disk failed\n"); - error = -ENOMEM; + error = PTR_ERR(gendisk); goto out_cache_cleanup; } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6772e0c654fa..84982221fc66 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2195,11 +2195,11 @@ static int zram_add(void) #endif /* gendisk structure */ - zram->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!zram->disk) { + zram->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(zram->disk)) { pr_err("Error allocating disk structure for device %d\n", device_id); - ret = -ENOMEM; + ret = PTR_ERR(zram->disk); goto out_free_idr; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index dc3f50f69714..9955ecff3839 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -935,8 +935,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove; - d->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!d->disk) + d->disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(d->disk)) goto out_bioset_exit; set_capacity(d->disk, sectors); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8dcabf84d866..b5e6a10b9cfd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2098,8 +2098,8 @@ static struct mapped_device *alloc_dev(int minor) * established. If request-based table is loaded: blk-mq will * override accordingly. */ - md->disk = blk_alloc_disk(md->numa_node_id); - if (!md->disk) + md->disk = blk_alloc_disk(NULL, md->numa_node_id); + if (IS_ERR(md->disk)) goto bad; md->queue = md->disk->queue; diff --git a/drivers/md/md.c b/drivers/md/md.c index e2a5f513dbb7..75266c34b1f9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5763,10 +5763,11 @@ struct mddev *md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - error = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + error = PTR_ERR(disk); goto out_free_mddev; + } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index bb3726b622ad..9a0eae01d598 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1496,11 +1496,11 @@ static int btt_blk_init(struct btt *btt) { struct nd_btt *nd_btt = btt->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - int rc = -ENOMEM; + int rc; - btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!btt->btt_disk) - return -ENOMEM; + btt->btt_disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(btt->btt_disk)) + return PTR_ERR(btt->btt_disk); nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); btt->btt_disk->first_minor = 0; diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4e8fdcb3f1c8..3a5df8d467c5 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -497,9 +497,9 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - disk = blk_alloc_disk(nid); - if (!disk) - return -ENOMEM; + disk = blk_alloc_disk(NULL, nid); + if (IS_ERR(disk)) + return PTR_ERR(disk); q = disk->queue; pmem->disk = disk; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74de1e64aeea..dc5d0d0a82d0 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -532,9 +532,9 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) !nvme_is_unique_nsid(ctrl, head) || !multipath) return 0; - head->disk = blk_alloc_disk(ctrl->numa_node); - if (!head->disk) - return -ENOMEM; + head->disk = blk_alloc_disk(NULL, ctrl->numa_node); + if (IS_ERR(head->disk)) + return PTR_ERR(head->disk); head->disk->fops = &nvme_ns_head_ops; head->disk->private_data = head; sprintf(head->disk->disk_name, "nvme%dn%d", diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4b7ecd4fd431..0903b432ea97 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -629,9 +629,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dev.release = dcssblk_release_segment; dev_info->dev.groups = dcssblk_dev_attr_groups; INIT_LIST_HEAD(&dev_info->lh); - dev_info->gd = blk_alloc_disk(NUMA_NO_NODE); - if (dev_info->gd == NULL) { - rc = -ENOMEM; + dev_info->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev_info->gd)) { + rc = PTR_ERR(dev_info->gd); goto seg_list_del; } dev_info->gd->major = dcssblk_major; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 45746ba73670..a14ea9344138 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -766,22 +766,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, -- cgit v1.2.3 From b361c9027b4e4159e7bcca4eb64fd26507c19994 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Fri, 23 Feb 2024 15:57:48 +0000 Subject: sched: Add a new function to compare if two cpus have the same capacity The new helper function is needed to help blk-mq check if it needs to dispatch the softirq on another CPU to match the performance level the IO requester is running at. This is important on HMP systems where not all CPUs have the same compute capacity. Signed-off-by: Qais Yousef Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20240223155749.2958009-2-qyousef@layalina.io Signed-off-by: Jens Axboe --- include/linux/sched/topology.h | 6 ++++++ kernel/sched/core.c | 11 +++++++++++ 2 files changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..11e0e00e0bb9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -176,6 +176,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); +bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); @@ -226,6 +227,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { } +static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + return true; +} + static inline bool cpus_share_cache(int this_cpu, int that_cpu) { return true; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 083f2258182d..540f229700b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu) } } +bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + if (!sched_asym_cpucap_active()) + return true; + + if (this_cpu == that_cpu) + return true; + + return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { if (this_cpu == that_cpu) -- cgit v1.2.3 From 13fe8e6825e44129b6cbeee41d3012554bf8d687 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 23 Feb 2024 15:55:39 +0800 Subject: ublk: add UBLK_CMD_DEL_DEV_ASYNC The current command UBLK_CMD_DEL_DEV won't return until the device is released, this way looks more reliable, but makes userspace more difficult to implement, especially about orders: unmap command buffer(which holds one ublkc reference), ublkc close, io_uring_file_unregister, ublkb close. Add UBLK_CMD_DEL_DEV_ASYNC so that device deletion won't wait release, then userspace needn't worry about the above order. Actually both loop and nbd is deleted in this async way. Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240223075539.89945-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 9 ++++++--- include/uapi/linux/ublk_cmd.h | 2 ++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 06d88d2008ba..bea3d5cf8a83 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -2468,7 +2468,7 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(struct ublk_device **p_ub) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) { struct ublk_device *ub = *p_ub; int idx = ub->ub_number; @@ -2502,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) * - the device number is freed already, we will not find this * device via ublk_get_device_from_id() */ - if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) return -EINTR; return 0; } @@ -2901,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(&ub); + ret = ublk_ctrl_del_dev(&ub, true); + break; + case UBLK_U_CMD_DEL_DEV_ASYNC: + ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: ret = ublk_ctrl_get_queue_affinity(ub, cmd); diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..c8dc5f8ea699 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -49,6 +49,8 @@ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_FEATURES \ _IOR('u', 0x13, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV_ASYNC \ + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of -- cgit v1.2.3 From 631d4efb8009df64deae8c1382b8cf43879a4e22 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:40 -0800 Subject: block: add a queue_limits_set helper Add a small wrapper around queue_limits_commit_update for stacking drivers that don't want to update existing limits, but set an entirely new set. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 18 ++++++++++++++++++ include/linux/blkdev.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index b6bbe683d218..1989a177be20 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -266,6 +266,24 @@ int queue_limits_commit_update(struct request_queue *q, } EXPORT_SYMBOL_GPL(queue_limits_commit_update); +/** + * queue_limits_commit_set - apply queue limits to queue + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were freshly initialized to @q. + * To update existing limits use queue_limits_start_update() and + * queue_limits_commit_update() instead. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_set(struct request_queue *q, struct queue_limits *lim) +{ + mutex_lock(&q->limits_lock); + return queue_limits_commit_update(q, lim); +} +EXPORT_SYMBOL_GPL(queue_limits_set); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a14ea9344138..dd510ad7ce4b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -889,6 +889,7 @@ queue_limits_start_update(struct request_queue *q) } int queue_limits_commit_update(struct request_queue *q, struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); /* * Access functions for manipulating queue properties -- cgit v1.2.3 From c1373f1cf452e4c7553a9d3bc05d87ec15c4f85f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 28 Feb 2024 14:56:41 -0800 Subject: block: add a queue_limits_stack_bdev helper Add a small wrapper around blk_stack_limits that allows passing a bdev for the bottom device and prints an error in case of misaligned device. The name fits into the new queue limits API and the intent is to eventually replace disk_stack_limits. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240228225653.947152-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 25 +++++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 1989a177be20..865fe4ebbf9b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -891,6 +891,31 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * queue_limits_stack_bdev - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top device) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * @pfx: prefix to use for warnings logged + * + * Description: + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + */ +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx) +{ + if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + offset)) + pr_notice("%s: Warning: Device %pg is misaligned\n", + pfx, bdev); +} +EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dd510ad7ce4b..285e82723d64 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -924,6 +924,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); -- cgit v1.2.3 From 25802f3ab7a589d2b5d9e1266acffad263d9893e Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:25 +0200 Subject: nvme-rdma: move NVME_RDMA_IP_PORT from common file The correct place for this definition is the nvme rdma header file and not the common nvme header file. Reviewed-by: Christoph Hellwig Reviewed-by: Israel Rukshin Reviewed-by: Sagi Grimberg Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- include/linux/nvme-rdma.h | 2 ++ include/linux/nvme.h | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 4dd7e6fe92fb..146dd2223a5f 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,6 +6,8 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H +#define NVME_RDMA_IP_PORT 4420 + #define NVME_RDMA_MAX_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bc605ec4a3fd..ce0c1143c7e4 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -23,8 +23,6 @@ #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -#define NVME_RDMA_IP_PORT 4420 - #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { -- cgit v1.2.3 From 36144964062b8676ee64281852de2a2c1b193aca Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:29 +0200 Subject: nvme-rdma: introduce NVME_RDMA_MAX_METADATA_QUEUE_SIZE definition This definition will be used by controllers that are configured with metadata support. For now, both regular and metadata controllers have the same maximal queue size but later commit will increase the maximal queue size for regular RDMA controllers to 256. We'll keep the maximal queue size for metadata controllers to be 128 since there are more resources that are needed for metadata operations and 128 is the optimal size found for metadata controllers base on testing. Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 2 ++ include/linux/nvme-rdma.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3a0f2c170f4c..b3f8416ec803 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -2015,6 +2015,8 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) { + if (ctrl->pi_support) + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; return NVME_RDMA_MAX_QUEUE_SIZE; } diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 146dd2223a5f..d0b9941911a1 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,7 +8,8 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit v1.2.3 From f096ba3286f5e773c496cf81667d01f2e8a2a37b Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Tue, 23 Jan 2024 16:40:32 +0200 Subject: nvmet-rdma: set max_queue_size for RDMA transport A new port configuration was added to set max_queue_size. Clamp user configuration to RDMA transport limits. Increase the maximal queue size of RDMA controllers from 128 to 256 (the default size stays 128 same as before). Reviewed-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Israel Rukshin Signed-off-by: Max Gurtovoy Signed-off-by: Keith Busch --- drivers/nvme/target/rdma.c | 8 ++++++++ include/linux/nvme-rdma.h | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index b3f8416ec803..f2bb9d95ecf4 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1956,6 +1956,14 @@ static int nvmet_rdma_add_port(struct nvmet_port *nport) nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; } + if (nport->max_queue_size < 0) { + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { + pr_warn("max_queue_size %u is too large, reducing to %u\n", + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index d0b9941911a1..eb2f04d636c8 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -8,8 +8,9 @@ #define NVME_RDMA_IP_PORT 4420 -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_MAX_QUEUE_SIZE 256 #define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 +#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, -- cgit v1.2.3 From f8c7511db009d42e2c24e48eeb04e3f1b67ab209 Mon Sep 17 00:00:00 2001 From: "Ricardo B. Marliere" Date: Tue, 5 Mar 2024 16:32:16 -0300 Subject: block: make block_class constant Since commit 43a7206b0963 ("driver core: class: make class_register() take a const *"), the driver core allows for struct class to be in read-only memory, so move the block_class structure to be declared at build time placing it into read-only memory, instead of having to be dynamically allocated at boot time. Cc: Greg Kroah-Hartman Suggested-by: Greg Kroah-Hartman Signed-off-by: Ricardo B. Marliere Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20240305-class_cleanup-block-v1-1-130bb27b9c72@marliere.net Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- drivers/base/base.h | 2 +- include/linux/blkdev.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/block/genhd.c b/block/genhd.c index 84c822d989da..a214f9cf3a35 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } -struct class block_class = { +const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; diff --git a/drivers/base/base.h b/drivers/base/base.h index eb4c0ace9242..0738ccad08b2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK -extern struct class block_class; +extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..19c7596f4ebf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -42,7 +42,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. -- cgit v1.2.3 From dd27a84b06aa9ea6a94b0f3e59dc768f981962e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 3 Mar 2024 07:01:50 -0700 Subject: block: remove disk_stack_limits disk_stack_limits is unused now, remove it. Signed-off-by: Christoph Hellwig Reviewed--by: Song Liu Tested-by: Song Liu Signed-off-by: Song Liu Link: https://lore.kernel.org/r/20240303140150.5435-12-hch@lst.de --- block/blk-settings.c | 24 ------------------------ include/linux/blkdev.h | 2 -- 2 files changed, 26 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 13865a9f8972..3c7d8d638ab5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -916,30 +916,6 @@ void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, } EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); -/** - * disk_stack_limits - adjust queue limits for stacked drivers - * @disk: MD/DM gendisk (top) - * @bdev: the underlying block device (bottom) - * @offset: offset to beginning of data within component device - * - * Description: - * Merges the limits for a top level gendisk and a bottom level - * block_device. - */ -void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset) -{ - struct request_queue *t = disk->queue; - - if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) - pr_notice("%s: Warning: Device %pg is misaligned\n", - disk->disk_name, bdev); - - disk_update_readahead(disk); -} -EXPORT_SYMBOL(disk_stack_limits); - /** * blk_queue_update_dma_pad - update pad mask * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 285e82723d64..75c909865a8b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -926,8 +926,6 @@ extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, sector_t offset, const char *pfx); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); -- cgit v1.2.3