From 4a9d6d667f0bafed55a9e9f5ae8bceb3680749d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:10 +0100 Subject: block: don't call into the driver for BLKFLSBUF BLKFLSBUF is entirely contained in the block core, and there is no good reason to give the driver a hook into processing it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 3fbc382eb926..c6d8863f0409 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -369,15 +369,8 @@ static inline int is_unrecognized_ioctl(int ret) static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; - fsync_bdev(bdev); invalidate_bdev(bdev); return 0; -- cgit v1.2.3 From e00adcadf3af7a8335026d71ab9f0e0a922191ac Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:11 +0100 Subject: block: add a new set_read_only method Add a new method to allow for driver-specific processing when setting or clearing the block device read-only state. This allows replacing the cumbersome and error-prone override of the whole ioctl implementation. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 5 +++++ include/linux/blkdev.h | 1 + 2 files changed, 6 insertions(+) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index c6d8863f0409..a6fa16b97705 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -389,6 +389,11 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } set_device_ro(bdev, n); return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 639cae2c158b..5c1ba8a8d2bc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1850,6 +1850,7 @@ struct block_device_operations { void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); + int (*set_read_only)(struct block_device *bdev, bool ro); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, -- cgit v1.2.3 From 732e12d805a77f74c907c0a28ece271ef1e81e01 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:15 +0100 Subject: block: don't call into the driver for BLKROSET Now that all drivers that want to hook into setting or clearing the read-only flag use the set_read_only method, this code can be removed. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index a6fa16b97705..96cb45447364 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -346,26 +346,6 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. - */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { @@ -384,9 +364,6 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; if (bdev->bd_disk->fops->set_read_only) { -- cgit v1.2.3 From 98f49b63e84d4ee1a5c327d0b5f4e8699f6c70fe Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:17 +0100 Subject: block: remove set_device_ro Fold set_device_ro into its only remaining caller. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 7 ------- block/ioctl.c | 2 +- include/linux/genhd.h | 1 - 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 9387f050c248..b85db1f2233c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1846,13 +1846,6 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; diff --git a/block/ioctl.c b/block/ioctl.c index 96cb45447364..04255dc5f3bf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -371,7 +371,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - set_device_ro(bdev, n); + bdev->bd_part->policy = n; return 0; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 03da3f603d30..52eb592c874a 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -304,7 +304,6 @@ extern void del_gendisk(struct gendisk *gp); extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); -extern void set_device_ro(struct block_device *bdev, int flag); extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) -- cgit v1.2.3 From a7cb3d2f09c8405aed59d97a7d02cebea43cd3c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Nov 2020 11:00:18 +0100 Subject: block: remove __blkdev_driver_ioctl Just open code it in the few callers. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/ioctl.c | 25 +++++-------------------- drivers/block/pktcdvd.c | 6 ++++-- drivers/md/bcache/request.c | 5 +++-- drivers/md/dm.c | 5 ++++- include/linux/blkdev.h | 2 -- 5 files changed, 16 insertions(+), 27 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 04255dc5f3bf..6b785181344f 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -219,23 +219,6 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. 
- */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block @@ -594,10 +577,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 467dbd06b7cd..ef1c1f094ea4 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2584,9 +2584,11 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, case CDROM_LAST_WRITTEN: case CDROM_SEND_PACKET: case SCSI_IOCTL_SEND_COMMAND: - ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + ret = -ENOTTY; + else + ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); break; - default: pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 214326383145..afac8d07c1bd 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1230,8 +1230,9 @@ static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, if (dc->io_disable) return -EIO; - - return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); + if (!dc->bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return dc->bdev->bd_disk->fops->ioctl(dc->bdev, mode, cmd, arg); } void bch_cached_dev_request_init(struct cached_dev *dc) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index c18fc2548518..6db395c3d28b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -570,7 +570,10 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, } } - r = __blkdev_driver_ioctl(bdev, mode, 
cmd, arg); + if (!bdev->bd_disk->fops->ioctl) + r = -ENOTTY; + else + r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); out: dm_unprepare_ioctl(md, srcu_idx); return r; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c1ba8a8d2bc..05b346a68c2e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1867,8 +1867,6 @@ extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, #define blkdev_compat_ptr_ioctl NULL #endif -extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, - unsigned long); extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -- cgit v1.2.3 From 6b3ba9762f9f9f651873af34481ca20e4a6791e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:24 +0100 Subject: block: cleanup del_gendisk a bit Merge three hidden gendisk checks into one. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b85db1f2233c..d41176eb1f36 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -895,6 +895,9 @@ void del_gendisk(struct gendisk *disk) might_sleep(); + if (WARN_ON_ONCE(!disk->queue)) + return; + blk_integrity_del(disk); disk_del_events(disk); @@ -917,20 +920,18 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; up_write(&disk->lookup_sem); - if (!(disk->flags & GENHD_FL_HIDDEN)) + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - if (disk->queue) { + /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). 
*/ - if (!(disk->flags & GENHD_FL_HIDDEN)) - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); - } else { - WARN_ON(1); + bdi_unregister(disk->queue->backing_dev_info); } + blk_unregister_queue(disk); + if (!(disk->flags & GENHD_FL_HIDDEN)) blk_unregister_region(disk_devt(disk), disk->minors); /* -- cgit v1.2.3 From 62b508f8b6b1b52843cd90f0b2068ed963f25bd3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:25 +0100 Subject: block: open code kobj_map into in block/genhd.c Copy and paste the kobj_map functionality in the block code in preparation for completely rewriting it. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 117 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index d41176eb1f36..667d1d6fd70a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -29,6 +28,16 @@ static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; +struct bdev_map { + struct bdev_map *next; + dev_t dev; + unsigned long range; + struct module *owner; + struct kobject *(*probe)(dev_t, int *, void *); + int (*lock)(dev_t, void *); + void *data; +} *bdev_map[255]; + /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -520,8 +529,6 @@ void unregister_blkdev(unsigned int major, const char *name) EXPORT_SYMBOL(unregister_blkdev); -static struct kobj_map *bdev_map; - /** * blk_mangle_minor - scatter minor numbers apart * @minor: minor number to mangle @@ -648,16 +655,60 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, struct kobject *(*probe)(dev_t, int *, void *), int (*lock)(dev_t, void *), void *data) { - kobj_map(bdev_map, devt, range, module, probe, lock, 
data); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *p; + + n = min(n, 255u); + p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); + if (p == NULL) + return; + for (i = 0; i < n; i++, p++) { + p->owner = module; + p->probe = probe; + p->lock = lock; + p->dev = devt; + p->range = range; + p->data = data; + } + + mutex_lock(&block_class_lock); + for (i = 0, p -= n; i < n; i++, p++, index++) { + struct bdev_map **s = &bdev_map[index % 255]; + while (*s && (*s)->range < range) + s = &(*s)->next; + p->next = *s; + *s = p; + } + mutex_unlock(&block_class_lock); +} EXPORT_SYMBOL(blk_register_region); void blk_unregister_region(dev_t devt, unsigned long range) { - kobj_unmap(bdev_map, devt, range); -} + unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; + unsigned index = MAJOR(devt); + unsigned i; + struct bdev_map *found = NULL; + mutex_lock(&block_class_lock); + for (i = 0; i < min(n, 255u); i++, index++) { + struct bdev_map **s; + for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { + struct bdev_map *p = *s; + if (p->dev == devt && p->range == range) { + *s = p->next; + if (!found) + found = p; + break; + } + } + } + mutex_unlock(&block_class_lock); + kfree(found); +} EXPORT_SYMBOL(blk_unregister_region); static struct kobject *exact_match(dev_t devt, int *partno, void *data) @@ -979,6 +1030,47 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +{ + struct kobject *kobj; + struct bdev_map *p; + unsigned long best = ~0UL; + +retry: + mutex_lock(&block_class_lock); + for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { + struct kobject *(*probe)(dev_t, int *, void *); + struct module *owner; + void *data; + + if (p->dev > dev || p->dev + p->range - 1 < dev) + continue; + if (p->range - 1 >= best) + break; + if (!try_module_get(p->owner)) + continue; + 
owner = p->owner; + data = p->data; + probe = p->probe; + best = p->range - 1; + *partno = dev - p->dev; + if (p->lock && p->lock(dev, data) < 0) { + module_put(owner); + continue; + } + mutex_unlock(&block_class_lock); + kobj = probe(dev, partno, data); + /* Currently ->owner protects _only_ ->probe() itself. */ + module_put(owner); + if (kobj) + return dev_to_disk(kobj_to_dev(kobj)); + goto retry; + } + mutex_unlock(&block_class_lock); + return NULL; +} + + /** * get_gendisk - get partitioning information for a given device * @devt: device to get partitioning information for @@ -996,11 +1088,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); + disk = lookup_gendisk(devt, partno); } else { struct hd_struct *part; @@ -1213,6 +1301,22 @@ static struct kobject *base_probe(dev_t devt, int *partno, void *data) return NULL; } +static void bdev_map_init(void) +{ + struct bdev_map *base; + int i; + + base = kzalloc(sizeof(*base), GFP_KERNEL); + if (!base) + panic("cannot allocate bdev_map"); + + base->dev = 1; + base->range = ~0 ; + base->probe = base_probe; + for (i = 0; i < 255; i++) + bdev_map[i] = base; +} + static int __init genhd_device_init(void) { int error; @@ -1221,7 +1325,7 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); + bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -- cgit v1.2.3 From e49fbbbf0aa14f011ab037086f37f58bd058a6ee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:26 +0100 Subject: block: split block_class_lock Split the block_class_lock mutex into one each to protect bdev_map and major_names. 
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 667d1d6fd70a..8226add353be 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,7 +25,6 @@ #include "blk.h" -static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; struct bdev_map { @@ -37,6 +36,7 @@ struct bdev_map { int (*lock)(dev_t, void *); void *data; } *bdev_map[255]; +static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) @@ -403,6 +403,7 @@ static struct blk_major_name { int major; char name[16]; } *major_names[BLKDEV_MAJOR_HASH_SIZE]; +static DEFINE_MUTEX(major_names_lock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -415,11 +416,11 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); } #endif /* CONFIG_PROC_FS */ @@ -448,7 +449,7 @@ int register_blkdev(unsigned int major, const char *name) struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { @@ -501,7 +502,7 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); return ret; } @@ -513,7 +514,7 @@ void unregister_blkdev(unsigned int major, const char *name) struct blk_major_name *p = NULL; int index = major_to_index(major); - mutex_lock(&block_class_lock); + 
mutex_lock(&major_names_lock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -523,7 +524,7 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); kfree(p); } @@ -674,7 +675,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->data = data; } - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0, p -= n; i < n; i++, p++, index++) { struct bdev_map **s = &bdev_map[index % 255]; while (*s && (*s)->range < range) @@ -682,7 +683,7 @@ void blk_register_region(dev_t devt, unsigned long range, struct module *module, p->next = *s; *s = p; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); } EXPORT_SYMBOL(blk_register_region); @@ -693,7 +694,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) unsigned i; struct bdev_map *found = NULL; - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (i = 0; i < min(n, 255u); i++, index++) { struct bdev_map **s; for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { @@ -706,7 +707,7 @@ void blk_unregister_region(dev_t devt, unsigned long range) } } } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kfree(found); } EXPORT_SYMBOL(blk_unregister_region); @@ -1037,7 +1038,7 @@ static struct gendisk *lookup_gendisk(dev_t dev, int *partno) unsigned long best = ~0UL; retry: - mutex_lock(&block_class_lock); + mutex_lock(&bdev_map_lock); for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { struct kobject *(*probe)(dev_t, int *, void *); struct module *owner; @@ -1058,7 +1059,7 @@ retry: module_put(owner); continue; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); kobj = probe(dev, partno, data); /* Currently ->owner protects _only_ ->probe() itself. 
*/ module_put(owner); @@ -1066,7 +1067,7 @@ retry: return dev_to_disk(kobj_to_dev(kobj)); goto retry; } - mutex_unlock(&block_class_lock); + mutex_unlock(&bdev_map_lock); return NULL; } -- cgit v1.2.3 From bd8eff3ba2caca53ea72cf3cc87a7797771dd7d2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:27 +0100 Subject: block: rework requesting modules for unclaimed devices Instead of reusing the ranges in bdev_map, add a new helper that is called if no range was found. This is a first step to unpeel and eventually remove the complex ranges structure. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 8226add353be..81017bd3b333 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1031,6 +1031,13 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } +static void request_gendisk_module(dev_t devt) +{ + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); +} + static struct gendisk *lookup_gendisk(dev_t dev, int *partno) { struct kobject *kobj; @@ -1055,6 +1062,14 @@ retry: probe = p->probe; best = p->range - 1; *partno = dev - p->dev; + + if (!probe) { + mutex_unlock(&bdev_map_lock); + module_put(owner); + request_gendisk_module(dev); + goto retry; + } + if (p->lock && p->lock(dev, data) < 0) { module_put(owner); continue; @@ -1293,15 +1308,6 @@ static const struct seq_operations partitions_op = { }; #endif - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - static void 
bdev_map_init(void) { struct bdev_map *base; @@ -1313,7 +1319,6 @@ static void bdev_map_init(void) base->dev = 1; base->range = ~0 ; - base->probe = base_probe; for (i = 0; i < 255; i++) bdev_map[i] = base; } -- cgit v1.2.3 From a160c6159d4a0cf82f28bc1658a958e278ec3688 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2020 15:58:28 +0100 Subject: block: add an optional probe callback to major_names Add a callback to the major_names array that allows a driver to override how to probe for dev_t that doesn't currently have a gendisk registered. This will help separating the lookup of the gendisk by dev_t vs probe action for a not currently registered dev_t. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 21 ++++++++++++++++++--- include/linux/genhd.h | 5 ++++- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 81017bd3b333..20521163fd06 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -402,6 +402,7 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; + void (*probe)(dev_t devt); } *major_names[BLKDEV_MAJOR_HASH_SIZE]; static DEFINE_MUTEX(major_names_lock); @@ -444,7 +445,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. 
*/ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; @@ -483,6 +485,7 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; + p->probe = probe; strlcpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -505,8 +508,7 @@ out: mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -1033,6 +1035,19 @@ static ssize_t disk_badblocks_store(struct device *dev, static void request_gendisk_module(dev_t devt) { + unsigned int major = MAJOR(devt); + struct blk_major_name **n; + + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; + } + } + mutex_unlock(&major_names_lock); + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) /* Make old-style 2.4 aliases work */ request_module("block-major-%d", MAJOR(devt)); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 52eb592c874a..811d0f83c4cf 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -367,7 +367,10 @@ extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk(minors) alloc_disk_node(minors, NUMA_NO_NODE) -int register_blkdev(unsigned int major, const char *name); +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)); +#define register_blkdev(major, name) \ + __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); void revalidate_disk_size(struct gendisk *disk, bool verbose); -- cgit v1.2.3 From e418de3abcda8b102f737919e830024d1455938f Mon Sep 17 00:00:00 2001 From: Christoph 
Hellwig Date: Thu, 29 Oct 2020 15:58:41 +0100 Subject: block: switch gendisk lookup to a simple xarray Now that bdev_map is only used for finding gendisks, we can use a simple xarray instead of the regions tracking structure for it. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Greg Kroah-Hartman Signed-off-by: Jens Axboe --- block/genhd.c | 208 +++++++++----------------------------------------- include/linux/genhd.h | 7 -- 2 files changed, 37 insertions(+), 178 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 20521163fd06..01d146598fe7 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,15 +27,7 @@ static struct kobject *block_depr; -struct bdev_map { - struct bdev_map *next; - dev_t dev; - unsigned long range; - struct module *owner; - struct kobject *(*probe)(dev_t, int *, void *); - int (*lock)(dev_t, void *); - void *data; -} *bdev_map[255]; +static DEFINE_XARRAY(bdev_map); static DEFINE_MUTEX(bdev_map_lock); /* for extended dynamic devt allocation, currently only one major is used */ @@ -649,85 +641,26 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. 
- */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) -{ - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *p; - - n = min(n, 255u); - p = kmalloc_array(n, sizeof(struct bdev_map), GFP_KERNEL); - if (p == NULL) - return; - - for (i = 0; i < n; i++, p++) { - p->owner = module; - p->probe = probe; - p->lock = lock; - p->dev = devt; - p->range = range; - p->data = data; - } +static void blk_register_region(struct gendisk *disk) +{ + int i; mutex_lock(&bdev_map_lock); - for (i = 0, p -= n; i < n; i++, p++, index++) { - struct bdev_map **s = &bdev_map[index % 255]; - while (*s && (*s)->range < range) - s = &(*s)->next; - p->next = *s; - *s = p; + for (i = 0; i < disk->minors; i++) { + if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) + WARN_ON_ONCE(1); } mutex_unlock(&bdev_map_lock); } -EXPORT_SYMBOL(blk_register_region); -void blk_unregister_region(dev_t devt, unsigned long range) +static void blk_unregister_region(struct gendisk *disk) { - unsigned n = MAJOR(devt + range - 1) - MAJOR(devt) + 1; - unsigned index = MAJOR(devt); - unsigned i; - struct bdev_map *found = NULL; + int i; mutex_lock(&bdev_map_lock); - for (i = 0; i < min(n, 255u); i++, index++) { - struct bdev_map **s; - for (s = &bdev_map[index % 255]; *s; s = &(*s)->next) { - struct bdev_map *p = *s; - if (p->dev == devt && p->range == range) { - *s = p->next; - if (!found) - found = p; - break; - } - } - } + for (i = 0; i < disk->minors; i++) + xa_erase(&bdev_map, disk_devt(disk) + i); mutex_unlock(&bdev_map_lock); - kfree(found); -} -EXPORT_SYMBOL(blk_unregister_region); - -static struct kobject *exact_match(dev_t devt, int *partno, void *data) -{ - struct gendisk *p = data; - - return &disk_to_dev(p)->kobj; -} - -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; - - if 
(!get_disk_and_module(p)) - return -1; - return 0; } static void disk_scan_partitions(struct gendisk *disk) @@ -873,8 +806,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); + blk_register_region(disk); } register_disk(parent, disk, groups); if (register_queue) @@ -987,7 +919,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); + blk_unregister_region(disk); /* * Remove gendisk pointer from idr so that it cannot be looked up * while RCU period before freeing gendisk is running to prevent @@ -1053,54 +985,22 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static struct gendisk *lookup_gendisk(dev_t dev, int *partno) +static bool get_disk_and_module(struct gendisk *disk) { - struct kobject *kobj; - struct bdev_map *p; - unsigned long best = ~0UL; - -retry: - mutex_lock(&bdev_map_lock); - for (p = bdev_map[MAJOR(dev) % 255]; p; p = p->next) { - struct kobject *(*probe)(dev_t, int *, void *); - struct module *owner; - void *data; - - if (p->dev > dev || p->dev + p->range - 1 < dev) - continue; - if (p->range - 1 >= best) - break; - if (!try_module_get(p->owner)) - continue; - owner = p->owner; - data = p->data; - probe = p->probe; - best = p->range - 1; - *partno = dev - p->dev; - - if (!probe) { - mutex_unlock(&bdev_map_lock); - module_put(owner); - request_gendisk_module(dev); - goto retry; - } + struct module *owner; - if (p->lock && p->lock(dev, data) < 0) { - module_put(owner); - continue; - } - mutex_unlock(&bdev_map_lock); - kobj = probe(dev, partno, data); - /* Currently ->owner protects _only_ ->probe() itself. 
*/ + if (!disk->fops) + return false; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return false; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { module_put(owner); - if (kobj) - return dev_to_disk(kobj_to_dev(kobj)); - goto retry; + return false; } - mutex_unlock(&bdev_map_lock); - return NULL; -} + return true; +} /** * get_gendisk - get partitioning information for a given device @@ -1119,7 +1019,19 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) might_sleep(); if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - disk = lookup_gendisk(devt, partno); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + if (!disk) { + mutex_unlock(&bdev_map_lock); + request_gendisk_module(devt); + mutex_lock(&bdev_map_lock); + disk = xa_load(&bdev_map, devt); + } + if (disk && !get_disk_and_module(disk)) + disk = NULL; + if (disk) + *partno = devt - disk_devt(disk); + mutex_unlock(&bdev_map_lock); } else { struct hd_struct *part; @@ -1323,21 +1235,6 @@ static const struct seq_operations partitions_op = { }; #endif -static void bdev_map_init(void) -{ - struct bdev_map *base; - int i; - - base = kzalloc(sizeof(*base), GFP_KERNEL); - if (!base) - panic("cannot allocate bdev_map"); - - base->dev = 1; - base->range = ~0 ; - for (i = 0; i < 255; i++) - bdev_map[i] = base; -} - static int __init genhd_device_init(void) { int error; @@ -1346,7 +1243,6 @@ static int __init genhd_device_init(void) error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map_init(); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); @@ -1895,35 +1791,6 @@ out_free_disk: } EXPORT_SYMBOL(__alloc_disk_node); -/** - * get_disk_and_module - increments the gendisk and gendisk fops module refcount - * @disk: the struct gendisk to increment the refcount for - * - * This increments the refcount for the struct gendisk, and the gendisk's - * fops module owner. - * - * Context: Any context. 
- */ -struct kobject *get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - struct kobject *kobj; - - if (!disk->fops) - return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return NULL; - kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); - return NULL; - } - return kobj; - -} -EXPORT_SYMBOL(get_disk_and_module); - /** * put_disk - decrements the gendisk refcount * @disk: the struct gendisk to decrement the refcount for @@ -1960,7 +1827,6 @@ void put_disk_and_module(struct gendisk *disk) module_put(owner); } } -EXPORT_SYMBOL(put_disk_and_module); static void set_disk_ro_uevent(struct gendisk *gd, int ro) { diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 811d0f83c4cf..3cbc2781ef34 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -339,15 +339,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev); int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); -extern struct kobject *get_disk_and_module(struct gendisk *disk); extern void put_disk(struct gendisk *disk); extern void put_disk_and_module(struct gendisk *disk); -extern void blk_register_region(dev_t devt, unsigned long range, - struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), - void *data); -extern void blk_unregister_region(dev_t devt, unsigned long range); #define alloc_disk_node(minors, node_id) \ ({ \ -- cgit v1.2.3 From e2b6b301871719d1db0b1ed7a1ed9e06750c80fc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 14 Nov 2020 18:08:21 +0100 Subject: block: fix the kerneldoc comment for __register_blkdev Switch the comment to talk about __register_blkdev instead of register_blkdev and document the new probe parameter. 
Fixes: 3da1a61e7046 ("block: add an optional probe callback to major_names") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/genhd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 01d146598fe7..ec2a24799cd9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -418,11 +418,12 @@ void blkdev_show(struct seq_file *seqf, off_t offset) #endif /* CONFIG_PROC_FS */ /** - * register_blkdev - register a new block device + * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string + * @probe: callback that is called on access to any minor number of @major * * The @name must be unique within the system. * @@ -436,6 +437,8 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. + * + * Use register_blkdev instead for any new code. */ int __register_blkdev(unsigned int major, const char *name, void (*probe)(dev_t devt)) -- cgit v1.2.3 From 449f4ec9892ebc2f37a7eae6d97db2cf7c65e09a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 16 Nov 2020 15:56:56 +0100 Subject: block: remove the update_bdev parameter to set_capacity_revalidate_and_notify The update_bdev argument is always set to true, so remove it. Also rename the function to the slightly less verbose set_capacity_and_notify, as propagating the disk size to the block device isn't really revalidation.
Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Reviewed-by: Petr Vorel Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++++-------- drivers/block/loop.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/nvme/host/core.c | 2 +- drivers/scsi/sd.c | 5 ++--- include/linux/genhd.h | 3 +-- 7 files changed, 12 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ec2a24799cd9..4e039524f92b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,17 +47,15 @@ static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); /* - * Set disk capacity and notify if the size is not currently - * zero and will not be set to zero + * Set disk capacity and notify if the size is not currently zero and will not + * be set to zero. Returns true if a uevent was sent, otherwise false. */ -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev) +bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); set_capacity(disk, size); - if (update_bdev) - revalidate_disk_size(disk, true); + revalidate_disk_size(disk, true); if (capacity != size && capacity != 0 && size != 0) { char *envp[] = { "RESIZE=1", NULL }; @@ -68,8 +66,7 @@ bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, return false; } - -EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); +EXPORT_SYMBOL_GPL(set_capacity_and_notify); /* * Format the device name of the indicated disk into the supplied buffer and diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fcc5e32f0993..9a27d4f1c08a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -251,7 +251,7 @@ loop_validate_block_size(unsigned short bsize) */ static void loop_set_size(struct loop_device *lo, loff_t size) { - if (!set_capacity_revalidate_and_notify(lo->lo_disk, size, true)) + if 
(!set_capacity_and_notify(lo->lo_disk, size)) kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index a314b9382442..3e812b4c32e6 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -470,7 +470,7 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) cap_str_10, cap_str_2); - set_capacity_revalidate_and_notify(vblk->disk, capacity, true); + set_capacity_and_notify(vblk->disk, capacity); } static void virtblk_config_changed_work(struct work_struct *work) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 48629d3433b4..79521e33d30e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2370,7 +2370,7 @@ static void blkfront_connect(struct blkfront_info *info) return; printk(KERN_INFO "Setting capacity to %Lu\n", sectors); - set_capacity_revalidate_and_notify(info->gd, sectors, true); + set_capacity_and_notify(info->gd, sectors); return; case BLKIF_STATE_SUSPENDED: diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f6c6479da0e9..6c144e748f8c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2053,7 +2053,7 @@ static void nvme_update_disk_info(struct gendisk *disk, capacity = 0; } - set_capacity_revalidate_and_notify(disk, capacity, true); + set_capacity_and_notify(disk, capacity); nvme_config_discard(disk, ns); nvme_config_write_zeroes(disk, ns); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index e9b898a1a480..679c2c025047 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3261,8 +3261,7 @@ static int sd_revalidate_disk(struct gendisk *disk) sdkp->first_scan = 0; - set_capacity_revalidate_and_notify(disk, - logical_to_sectors(sdp, sdkp->capacity), true); + set_capacity_and_notify(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); kfree(buffer); @@ -3272,7 +3271,7 @@ static int sd_revalidate_disk(struct gendisk 
*disk) * capacity to 0. */ if (sd_zbc_revalidate_zones(sdkp)) - set_capacity_revalidate_and_notify(disk, 0, true); + set_capacity_and_notify(disk, 0); out: return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3cbc2781ef34..46553d6d6025 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -314,8 +314,7 @@ static inline int get_disk_ro(struct gendisk *disk) extern void disk_block_events(struct gendisk *disk); extern void disk_unblock_events(struct gendisk *disk); extern void disk_flush_events(struct gendisk *disk, unsigned int mask); -bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool update_bdev); +bool set_capacity_and_notify(struct gendisk *disk, sector_t size); /* drivers/char/random.c */ extern void add_disk_randomness(struct gendisk *disk) __latent_entropy; -- cgit v1.2.3 From 5a20d073ec54a72d9a732fa44bfe14954eb6332f Mon Sep 17 00:00:00 2001 From: Lei Chen Date: Mon, 30 Nov 2020 10:20:52 +0800 Subject: block: wbt: Remove unnecessary invoking of wbt_update_limits in wbt_init It's unnecessary to call wbt_update_limits explicitly within wbt_init, because it will be called in the following function wbt_queue_depth_changed. Signed-off-by: Lei Chen Signed-off-by: Jens Axboe --- block/blk-wbt.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block') diff --git a/block/blk-wbt.c b/block/blk-wbt.c index fd410086fe1d..0321ca83e73f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -835,7 +835,6 @@ int wbt_init(struct request_queue *q) rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; - wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. -- cgit v1.2.3 From 3f50b95e0edd22824b2650eb65466bf7060f7488 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:43:52 +0100 Subject: block: remove a superfluous check in blkpg_do_ioctl sector_t is now always a u64, so this check is not needed.
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/ioctl.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 6b785181344f..0c09bb7a6ff3 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -35,15 +35,6 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) < sizeof(long long)) { - long pstart = start, plength = length; - - if (pstart != start || plength != length || pstart < 0 || - plength < 0 || p.pno > 65535) - return -EINVAL; - } - switch (op) { case BLKPG_ADD_PARTITION: /* check if partition is aligned to blocksize */ -- cgit v1.2.3 From e79319af6d8cfd7311fef1bfbb1c59c94e6e10a9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 06:48:53 +0100 Subject: block: use disk_part_iter_exit in disk_part_iter_next Call disk_part_iter_exit in disk_part_iter_next instead of duplicating the functionality. 
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4e039524f92b..0bd9c41dd4cb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -227,8 +227,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) int inc, end; /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; + disk_part_iter_exit(piter); /* get part_tbl */ rcu_read_lock(); -- cgit v1.2.3 From efdc41c8d49fc1ff9bbef8f68f1cf1d8d59164a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Nov 2020 07:25:37 +0100 Subject: block: use put_device in put_disk Use put_device to put the device instead of poking into the internals and using kobject_put. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0bd9c41dd4cb..f46e89226fdf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1803,7 +1803,7 @@ EXPORT_SYMBOL(__alloc_disk_node); void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk_to_dev(disk)->kobj); + put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); -- cgit v1.2.3 From 4e7b5671c6a883d94b5428e1a9c141bbd56cb2a6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 13:38:40 +0100 Subject: block: remove i_bdev Switch the block device lookup interfaces to directly work with a dev_t so that struct block_device references are only acquired by the blkdev_get variants (and the blk-cgroup special case). 
This means that we now don't need an extra reference in the inode and can generally simplify handling of struct block_device to keep the lookups contained in the core block layer code. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Tejun Heo Acked-by: Coly Li [bcache] Signed-off-by: Jens Axboe --- block/ioctl.c | 3 +- drivers/block/loop.c | 8 +- drivers/md/bcache/super.c | 20 +-- drivers/md/dm-table.c | 9 +- drivers/mtd/mtdsuper.c | 17 +-- drivers/target/target_core_file.c | 6 +- drivers/usb/gadget/function/storage_common.c | 8 +- fs/block_dev.c | 196 +++++++-------------------- fs/btrfs/volumes.c | 13 +- fs/inode.c | 3 - fs/internal.h | 7 +- fs/io_uring.c | 10 +- fs/pipe.c | 5 +- fs/quota/quota.c | 19 ++- fs/statfs.c | 2 +- fs/super.c | 44 +++--- include/linux/blkdev.h | 2 +- include/linux/fs.h | 1 - 18 files changed, 121 insertions(+), 252 deletions(-) (limited to 'block') diff --git a/block/ioctl.c b/block/ioctl.c index 0c09bb7a6ff3..a6d8171221c7 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -590,8 +590,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; loff_t size; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index b42c728620c9..26c7aafba7c5 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -675,10 +675,10 @@ static int loop_validate_file(struct file *file, struct block_device *bdev) while (is_loop_device(f)) { struct loop_device *l; - if (f->f_mapping->host->i_bdev == bdev) + if (f->f_mapping->host->i_rdev == bdev->bd_dev) return -EBADF; - l = f->f_mapping->host->i_bdev->bd_disk->private_data; + l = I_BDEV(f->f_mapping->host)->bd_disk->private_data; if (l->lo_state != Lo_bound) { return 
-EINVAL; } @@ -885,9 +885,7 @@ static void loop_config_discard(struct loop_device *lo) * file-backed loop devices: discarded regions read back as zero. */ if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) { - struct request_queue *backingq; - - backingq = bdev_get_queue(inode->i_bdev); + struct request_queue *backingq = bdev_get_queue(I_BDEV(inode)); max_discard_sectors = backingq->limits.max_write_zeroes_sectors; granularity = backingq->limits.discard_granularity ?: diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a6a5e21e4fd1..c55d3c58a7ef 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2380,38 +2380,38 @@ kobj_attribute_write(register, register_bcache); kobj_attribute_write(register_quiet, register_bcache); kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup); -static bool bch_is_open_backing(struct block_device *bdev) +static bool bch_is_open_backing(dev_t dev) { struct cache_set *c, *tc; struct cached_dev *dc, *t; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) list_for_each_entry_safe(dc, t, &c->cached_devs, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; list_for_each_entry_safe(dc, t, &uncached_devices, list) - if (dc->bdev == bdev) + if (dc->bdev->bd_dev == dev) return true; return false; } -static bool bch_is_open_cache(struct block_device *bdev) +static bool bch_is_open_cache(dev_t dev) { struct cache_set *c, *tc; list_for_each_entry_safe(c, tc, &bch_cache_sets, list) { struct cache *ca = c->cache; - if (ca->bdev == bdev) + if (ca->bdev->bd_dev == dev) return true; } return false; } -static bool bch_is_open(struct block_device *bdev) +static bool bch_is_open(dev_t dev) { - return bch_is_open_cache(bdev) || bch_is_open_backing(bdev); + return bch_is_open_cache(dev) || bch_is_open_backing(dev); } struct async_reg_args { @@ -2535,9 +2535,11 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, sb); if (IS_ERR(bdev)) { if (bdev 
== ERR_PTR(-EBUSY)) { - bdev = lookup_bdev(strim(path)); + dev_t dev; + mutex_lock(&bch_register_lock); - if (!IS_ERR(bdev) && bch_is_open(bdev)) + if (lookup_bdev(strim(path), &dev) == 0 && + bch_is_open(dev)) err = "device already registered"; else err = "device busy"; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ce543b761be7..dea677721710 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -348,16 +348,9 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode, dev_t dm_get_dev_t(const char *path) { dev_t dev; - struct block_device *bdev; - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) + if (lookup_bdev(path, &dev)) dev = name_to_dev_t(path); - else { - dev = bdev->bd_dev; - bdput(bdev); - } - return dev; } EXPORT_SYMBOL_GPL(dm_get_dev_t); diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c index c3e2098372f2..38b6aa849c63 100644 --- a/drivers/mtd/mtdsuper.c +++ b/drivers/mtd/mtdsuper.c @@ -120,8 +120,8 @@ int get_tree_mtd(struct fs_context *fc, struct fs_context *fc)) { #ifdef CONFIG_BLOCK - struct block_device *bdev; - int ret, major; + dev_t dev; + int ret; #endif int mtdnr; @@ -169,20 +169,15 @@ int get_tree_mtd(struct fs_context *fc, /* try the old way - the hack where we allowed users to mount * /dev/mtdblock$(n) but didn't actually _use_ the blockdev */ - bdev = lookup_bdev(fc->source); - if (IS_ERR(bdev)) { - ret = PTR_ERR(bdev); + ret = lookup_bdev(fc->source, &dev); + if (ret) { errorf(fc, "MTD: Couldn't look up '%s': %d", fc->source, ret); return ret; } pr_debug("MTDSB: lookup_bdev() returned 0\n"); - major = MAJOR(bdev->bd_dev); - mtdnr = MINOR(bdev->bd_dev); - bdput(bdev); - - if (major == MTD_BLOCK_MAJOR) - return mtd_get_sb_by_nr(fc, mtdnr, fill_super); + if (MAJOR(dev) == MTD_BLOCK_MAJOR) + return mtd_get_sb_by_nr(fc, MINOR(dev), fill_super); #endif /* CONFIG_BLOCK */ diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7143d03f0e02..b0cb5b95e892 100644 
--- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -133,10 +133,10 @@ static int fd_configure_device(struct se_device *dev) */ inode = file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { - struct request_queue *q = bdev_get_queue(inode->i_bdev); + struct request_queue *q = bdev_get_queue(I_BDEV(inode)); unsigned long long dev_size; - fd_dev->fd_block_size = bdev_logical_block_size(inode->i_bdev); + fd_dev->fd_block_size = bdev_logical_block_size(I_BDEV(inode)); /* * Determine the number of bytes from i_size_read() minus * one (1) logical sector from underlying struct block_device @@ -559,7 +559,7 @@ fd_execute_unmap(struct se_cmd *cmd, sector_t lba, sector_t nolb) if (S_ISBLK(inode->i_mode)) { /* The backend is block device, use discard */ - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(inode); struct se_device *dev = cmd->se_dev; ret = blkdev_issue_discard(bdev, diff --git a/drivers/usb/gadget/function/storage_common.c b/drivers/usb/gadget/function/storage_common.c index f7e6c42558eb..b859a158a414 100644 --- a/drivers/usb/gadget/function/storage_common.c +++ b/drivers/usb/gadget/function/storage_common.c @@ -204,7 +204,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_WRITE)) ro = 1; - inode = file_inode(filp); + inode = filp->f_mapping->host; if ((!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))) { LINFO(curlun, "invalid file type: %s\n", filename); goto out; @@ -221,7 +221,7 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (!(filp->f_mode & FMODE_CAN_WRITE)) ro = 1; - size = i_size_read(inode->i_mapping->host); + size = i_size_read(inode); if (size < 0) { LINFO(curlun, "unable to find file size: %s\n", filename); rc = (int) size; @@ -231,8 +231,8 @@ int fsg_lun_open(struct fsg_lun *curlun, const char *filename) if (curlun->cdrom) { blksize = 2048; blkbits = 11; - } else if (inode->i_bdev) { - blksize = 
bdev_logical_block_size(inode->i_bdev); + } else if (S_ISBLK(inode->i_mode)) { + blksize = bdev_logical_block_size(I_BDEV(inode)); blkbits = blksize_bits(blksize); } else { blksize = 512; diff --git a/fs/block_dev.c b/fs/block_dev.c index 2b8c0586314f..6d6e4d50834c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -883,7 +883,6 @@ static struct block_device *bdget(dev_t dev) bdev->bd_dev = dev; inode->i_mode = S_IFBLK; inode->i_rdev = dev; - inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); @@ -928,67 +927,8 @@ void bdput(struct block_device *bdev) { iput(bdev->bd_inode); } - EXPORT_SYMBOL(bdput); -static struct block_device *bd_acquire(struct inode *inode) -{ - struct block_device *bdev; - - spin_lock(&bdev_lock); - bdev = inode->i_bdev; - if (bdev && !inode_unhashed(bdev->bd_inode)) { - bdgrab(bdev); - spin_unlock(&bdev_lock); - return bdev; - } - spin_unlock(&bdev_lock); - - /* - * i_bdev references block device inode that was already shut down - * (corresponding device got removed). Remove the reference and look - * up block device inode again just in case new device got - * reestablished under the same device number. - */ - if (bdev) - bd_forget(inode); - - bdev = bdget(inode->i_rdev); - if (bdev) { - spin_lock(&bdev_lock); - if (!inode->i_bdev) { - /* - * We take an additional reference to bd_inode, - * and it's released in clear_inode() of inode. - * So, we can access it via ->i_mapping always - * without igrab(). 
- */ - bdgrab(bdev); - inode->i_bdev = bdev; - inode->i_mapping = bdev->bd_inode->i_mapping; - } - spin_unlock(&bdev_lock); - } - return bdev; -} - -/* Call when you free inode */ - -void bd_forget(struct inode *inode) -{ - struct block_device *bdev = NULL; - - spin_lock(&bdev_lock); - if (!sb_is_blkdev_sb(inode->i_sb)) - bdev = inode->i_bdev; - inode->i_bdev = NULL; - inode->i_mapping = &inode->i_data; - spin_unlock(&bdev_lock); - - if (bdev) - bdput(bdev); -} - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1497,38 +1437,45 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, } /** - * blkdev_get - open a block device - * @bdev: block_device to open + * blkdev_get_by_dev - open a block device by device number + * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is - * open with exclusive access. Specifying %FMODE_EXCL with %NULL - * @holder is invalid. Exclusive opens may nest for the same @holder. + * Open the block device described by device number @dev. If @mode includes + * %FMODE_EXCL, the block device is opened with exclusive access. Specifying + * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for + * the same @holder. * - * On success, the reference count of @bdev is unchanged. On failure, - * @bdev is put. + * Use this interface ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a device + * number. Everything else should use blkdev_get_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * 0 on success, -errno on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. 
*/ -static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) +struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { struct block_device *claiming; bool unblock_events = true; + struct block_device *bdev; struct gendisk *disk; int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - imajor(bdev->bd_inode), iminor(bdev->bd_inode), + MAJOR(dev), MINOR(dev), ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - goto bdput; + return ERR_PTR(ret); + + bdev = bdget(dev); + if (!bdev) + return ERR_PTR(-ENOMEM); /* * If we lost a race with 'disk' being deleted, try again. See md.c. @@ -1589,10 +1536,13 @@ put_disk: if (ret == -ERESTARTSYS) goto retry; bdput: - if (ret) + if (ret) { bdput(bdev); - return ret; + return ERR_PTR(ret); + } + return bdev; } +EXPORT_SYMBOL(blkdev_get_by_dev); /** * blkdev_get_by_path - open a block device by name @@ -1600,32 +1550,30 @@ bdput: * @mode: FMODE_* mask * @holder: exclusive holder identifier * - * Open the blockdevice described by the device file at @path. @mode - * and @holder are identical to blkdev_get(). - * - * On success, the returned block_device has reference count of one. + * Open the block device described by the device file at @path. If @mode + * includes %FMODE_EXCL, the block device is opened with exclusive access. + * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may + * nest for the same @holder. * * CONTEXT: * Might sleep. * * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. + * Reference to the block_device on success, ERR_PTR(-errno) on failure. 
*/ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; - int err; - - bdev = lookup_bdev(path); - if (IS_ERR(bdev)) - return bdev; + dev_t dev; + int error; - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); - if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { + bdev = blkdev_get_by_dev(dev, mode, holder); + if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); } @@ -1634,45 +1582,6 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -/** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the blockdevice described by device number @dev. @mode and - * @holder are identical to blkdev_get(). - * - * Use it ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a - * device number. _Never_ to be used for internal purposes. If you - * ever need it - reconsider your API. - * - * On success, the returned block_device has reference count of one. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Pointer to block_device on success, ERR_PTR(-errno) on failure. 
- */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) -{ - struct block_device *bdev; - int err; - - bdev = bdget(dev); - if (!bdev) - return ERR_PTR(-ENOMEM); - - err = blkdev_get(bdev, mode, holder); - if (err) - return ERR_PTR(err); - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_dev); - static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; @@ -1694,14 +1603,12 @@ static int blkdev_open(struct inode * inode, struct file * filp) if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; - bdev = bd_acquire(inode); - if (bdev == NULL) - return -ENOMEM; - + bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - - return blkdev_get(bdev, filp->f_mode, filp); + return 0; } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) @@ -2010,37 +1917,32 @@ const struct file_operations def_blk_fops = { * namespace if possible and return it. Return ERR_PTR(error) * otherwise. 
*/ -struct block_device *lookup_bdev(const char *pathname) +int lookup_bdev(const char *pathname, dev_t *dev) { - struct block_device *bdev; struct inode *inode; struct path path; int error; if (!pathname || !*pathname) - return ERR_PTR(-EINVAL); + return -EINVAL; error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) - return ERR_PTR(error); + return error; inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) - goto fail; + goto out_path_put; error = -EACCES; if (!may_open_dev(&path)) - goto fail; - error = -ENOMEM; - bdev = bd_acquire(inode); - if (!bdev) - goto fail; -out: + goto out_path_put; + + *dev = inode->i_rdev; + error = 0; +out_path_put: path_put(&path); - return bdev; -fail: - bdev = ERR_PTR(error); - goto out; + return error; } EXPORT_SYMBOL(lookup_bdev); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a6406b3b8c2b..fbc4b58228f7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -929,16 +929,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, * make sure it's the same device if the device is mounted */ if (device->bdev) { - struct block_device *path_bdev; + int error; + dev_t path_dev; - path_bdev = lookup_bdev(path); - if (IS_ERR(path_bdev)) { + error = lookup_bdev(path, &path_dev); + if (error) { mutex_unlock(&fs_devices->device_list_mutex); - return ERR_CAST(path_bdev); + return ERR_PTR(error); } - if (device->bdev != path_bdev) { - bdput(path_bdev); + if (device->bdev->bd_dev != path_dev) { mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, "duplicate device %s devid %llu generation %llu scanned by %s (%d)", @@ -947,7 +947,6 @@ static noinline struct btrfs_device *device_list_add(const char *path, task_pid_nr(current)); return ERR_PTR(-EEXIST); } - bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, "devid %llu device path %s changed to %s scanned by %s (%d)", devid, rcu_str_deref(device->name), diff --git a/fs/inode.c b/fs/inode.c 
index 9d78c37b00b8..cb008acf0efd 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -155,7 +155,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bytes = 0; inode->i_generation = 0; inode->i_pipe = NULL; - inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; inode->i_dir_seq = 0; @@ -580,8 +579,6 @@ static void evict(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); } - if (S_ISBLK(inode->i_mode) && inode->i_bdev) - bd_forget(inode); if (S_ISCHR(inode->i_mode) && inode->i_cdev) cd_forget(inode); diff --git a/fs/internal.h b/fs/internal.h index 47be21dfeebe..53f890446e75 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -25,7 +25,6 @@ extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); -void bd_forget(struct inode *inode); #else static inline void bdev_cache_init(void) { @@ -43,9 +42,6 @@ static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; } -static inline void bd_forget(struct inode *inode) -{ -} #endif /* CONFIG_BLOCK */ /* @@ -114,8 +110,7 @@ extern struct file *alloc_empty_file_noaccount(int, const struct cred *); */ extern int reconfigure_super(struct fs_context *); extern bool trylock_super(struct super_block *sb); -struct super_block *__get_super(struct block_device *bdev, bool excl); -extern struct super_block *user_get_super(dev_t); +struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); diff --git a/fs/io_uring.c b/fs/io_uring.c index 4ead291b2976..8f13c0417f94 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2716,11 +2716,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd) static bool io_bdev_nowait(struct block_device *bdev) { -#ifdef CONFIG_BLOCK return !bdev || 
blk_queue_nowait(bdev_get_queue(bdev)); -#else - return true; -#endif } /* @@ -2733,14 +2729,16 @@ static bool io_file_supports_async(struct file *file, int rw) umode_t mode = file_inode(file)->i_mode; if (S_ISBLK(mode)) { - if (io_bdev_nowait(file->f_inode->i_bdev)) + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(I_BDEV(file->f_mapping->host))) return true; return false; } if (S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode)) { - if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) && + if (IS_ENABLED(CONFIG_BLOCK) && + io_bdev_nowait(file->f_inode->i_sb->s_bdev) && file->f_op != &io_uring_fops) return true; return false; diff --git a/fs/pipe.c b/fs/pipe.c index 0ac197658a2d..c5989cfd564d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1342,9 +1342,8 @@ out_revert_acct: } /* - * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same - * location, so checking ->i_pipe is not enough to verify that this is a - * pipe. + * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is + * not enough to verify that this is a pipe. 
*/ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { diff --git a/fs/quota/quota.c b/fs/quota/quota.c index f3d32b0d9008..6d16b2be5ac4 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -866,17 +866,18 @@ static bool quotactl_cmd_onoff(int cmd) static struct super_block *quotactl_block(const char __user *special, int cmd) { #ifdef CONFIG_BLOCK - struct block_device *bdev; struct super_block *sb; struct filename *tmp = getname(special); bool excl = false, thawed = false; + int error; + dev_t dev; if (IS_ERR(tmp)) return ERR_CAST(tmp); - bdev = lookup_bdev(tmp->name); + error = lookup_bdev(tmp->name, &dev); putname(tmp); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + if (error) + return ERR_PTR(error); if (quotactl_cmd_onoff(cmd)) { excl = true; @@ -886,8 +887,10 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) } retry: - sb = __get_super(bdev, excl); - if (thawed && sb && sb->s_writers.frozen != SB_UNFROZEN) { + sb = user_get_super(dev, excl); + if (!sb) + return ERR_PTR(-ENODEV); + if (thawed && sb->s_writers.frozen != SB_UNFROZEN) { if (excl) up_write(&sb->s_umount); else @@ -897,10 +900,6 @@ retry: put_super(sb); goto retry; } - - bdput(bdev); - if (!sb) - return ERR_PTR(-ENODEV); return sb; #else diff --git a/fs/statfs.c b/fs/statfs.c index 59f33752c131..68cb07788750 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -235,7 +235,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { - struct super_block *s = user_get_super(dev); + struct super_block *s = user_get_super(dev, false); int err; if (!s) return -EINVAL; diff --git a/fs/super.c b/fs/super.c index 343e5c1e538d..2c6cdea2ab2d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -740,7 +740,14 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -struct super_block *__get_super(struct block_device *bdev, bool excl) +/** + * 
get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ +struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; @@ -755,17 +762,11 @@ rescan: if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - if (!excl) - down_read(&sb->s_umount); - else - down_write(&sb->s_umount); + down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - if (!excl) - up_read(&sb->s_umount); - else - up_write(&sb->s_umount); + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -776,19 +777,6 @@ rescan: return NULL; } -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - return __get_super(bdev, false); -} -EXPORT_SYMBOL(get_super); - /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for @@ -820,7 +808,7 @@ restart: return NULL; } -struct super_block *user_get_super(dev_t dev) +struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; @@ -832,11 +820,17 @@ rescan: if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); /* still alive? 
*/ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; - up_read(&sb->s_umount); + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12810a19edeb..bdd7339bcda4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1973,7 +1973,7 @@ int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size); const char *bdevname(struct block_device *bdev, char *buffer); -struct block_device *lookup_bdev(const char *); +int lookup_bdev(const char *pathname, dev_t *dev); void blkdev_show(struct seq_file *seqf, off_t offset); diff --git a/include/linux/fs.h b/include/linux/fs.h index a61df0dd4f19..b0b358309657 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -696,7 +696,6 @@ struct inode { struct list_head i_devices; union { struct pipe_inode_info *i_pipe; - struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; -- cgit v1.2.3 From 22ae8ce8b89241c94ac00c237752c0ffa37ba5ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 09:23:26 +0100 Subject: block: simplify bdev/disk lookup in blkdev_get To simplify block device lookup and a few other upcoming areas, make sure that we always have a struct block_device available for each disk and each partition, and only find existing block devices in bdget. The only downside of this is that each device and partition uses a little more memory. The upside will be that a lot of code can be simplified. With that all we need to look up the block device is to lookup the inode and do a few sanity checks on the gendisk, instead of the separate lookup for the gendisk. For blk-cgroup which wants to access a gendisk without opening it, a new blkdev_{get,put}_no_open low-level interface is added to replace the previous get_gendisk use. 
Note that the change to look up block device directly instead of the two step lookup using struct gendisk causes a subtle change in behavior: accessing a non-existing partition on an existing block device can now cause a call to request_module. That call is harmless, and in practice no recent system will access these nodes as they aren't created by udev and static /dev/ setups are unusual. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 42 ++++----- block/blk-iocost.c | 36 ++++---- block/blk.h | 2 +- block/genhd.c | 210 ++++++--------------------------------------- block/partitions/core.c | 29 ++++--- fs/block_dev.c | 177 ++++++++++++++++++++++---------------- include/linux/blk-cgroup.h | 4 +- include/linux/blkdev.h | 6 ++ include/linux/genhd.h | 7 +- 9 files changed, 194 insertions(+), 319 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c68bdf58c9a6..ad02289a4f7f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -556,22 +556,22 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, } /** - * blkg_conf_prep - parse and prepare for per-blkg config update + * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is + * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason.
*/ -struct gendisk *blkcg_conf_get_disk(char **inputp) +struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); @@ -581,16 +581,16 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) return ERR_PTR(-EINVAL); input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; - return disk; + return bdev; } /** @@ -607,18 +607,18 @@ struct gendisk *blkcg_conf_get_disk(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { - struct gendisk *disk; + struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - q = disk->queue; + q = bdev->bd_disk->queue; rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -689,7 +689,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, goto success; } success: - ctx->disk = disk; + ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; @@ -700,7 +700,7 @@ fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). 
It isn't strictly necessary but queue @@ -723,11 +723,11 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); - put_disk_and_module(ctx->disk); + blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index bbe86d1199dc..8e20fe4bddec 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -3120,23 +3120,23 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3231,12 +3231,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } @@ -3287,23 +3287,23 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct block_device *bdev; struct ioc 
*ioc; u64 u[NR_I_LCOEFS]; bool user; char *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + bdev = blkcg_conf_open_bdev(&input); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(bdev->bd_disk->queue); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(bdev->bd_disk->queue); } spin_lock_irq(&ioc->lock); @@ -3356,13 +3356,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return nbytes; einval: ret = -EINVAL; err: - put_disk_and_module(disk); + blkdev_put_no_open(bdev); return ret; } diff --git a/block/blk.h b/block/blk.h index dfab98465db9..c4839abcfa27 100644 --- a/block/blk.h +++ b/block/blk.h @@ -352,7 +352,6 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); -void blk_invalidate_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 @@ -384,6 +383,7 @@ static inline void hd_free_part(struct hd_struct *part) { free_percpu(part->dkstats); kfree(part->info); + bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index f46e89226fdf..bf8fa82f135f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -27,17 +27,11 @@ static struct kobject *block_depr; -static DEFINE_XARRAY(bdev_map); -static DEFINE_MUTEX(bdev_map_lock); +DECLARE_RWSEM(bdev_lookup_sem); /* for extended dynamic devt allocation, currently only one major is used */ #define NR_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_lock prevents look up - * results from going away underneath its user. 
- */ -static DEFINE_SPINLOCK(ext_devt_lock); -static DEFINE_IDR(ext_devt_idr); +static DEFINE_IDA(ext_devt_ida); static void disk_check_events(struct disk_events *ev, unsigned int *clearing_ptr); @@ -580,14 +574,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return 0; } - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT, GFP_KERNEL); if (idx < 0) return idx == -ENOSPC ? -EBUSY : idx; @@ -606,26 +593,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) */ void blk_free_devt(dev_t devt) { - if (devt == MKDEV(0, 0)) - return; - - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} - -/* - * We invalidate devt by assigning NULL pointer for devt in idr. 
- */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } + if (MAJOR(devt) == BLOCK_EXT_MAJOR) + ida_free(&ext_devt_ida, blk_mangle_minor(MINOR(devt))); } static char *bdevt_str(dev_t devt, char *buf) @@ -640,28 +609,6 @@ static char *bdevt_str(dev_t devt, char *buf) return buf; } -static void blk_register_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) { - if (xa_insert(&bdev_map, disk_devt(disk) + i, disk, GFP_KERNEL)) - WARN_ON_ONCE(1); - } - mutex_unlock(&bdev_map_lock); -} - -static void blk_unregister_region(struct gendisk *disk) -{ - int i; - - mutex_lock(&bdev_map_lock); - for (i = 0; i < disk->minors; i++) - xa_erase(&bdev_map, disk_devt(disk) + i); - mutex_unlock(&bdev_map_lock); -} - static void disk_scan_partitions(struct gendisk *disk) { struct block_device *bdev; @@ -805,7 +752,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - blk_register_region(disk); + bdev_add(disk->part0.bdev, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -847,8 +794,8 @@ static void invalidate_partition(struct gendisk *disk, int partno) __invalidate_device(bdev, true); /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. + * Unhash the bdev inode for this device so that it can't be looked + * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); bdput(bdev); @@ -890,7 +837,8 @@ void del_gendisk(struct gendisk *disk) * Block lookups of the disk until all bdevs are unhashed and the * disk is marked as dead (GENHD_FL_UP cleared). 
*/ - down_write(&disk->lookup_sem); + down_write(&bdev_lookup_sem); + /* invalidate stuff */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); @@ -903,7 +851,7 @@ void del_gendisk(struct gendisk *disk) invalidate_partition(disk, 0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); + up_write(&bdev_lookup_sem); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -916,16 +864,6 @@ void del_gendisk(struct gendisk *disk) } blk_unregister_queue(disk); - - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(disk_devt(disk)); kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); @@ -964,7 +902,7 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -static void request_gendisk_module(dev_t devt) +void blk_request_module(dev_t devt) { unsigned int major = MAJOR(devt); struct blk_major_name **n; @@ -984,84 +922,6 @@ static void request_gendisk_module(dev_t devt) request_module("block-major-%d", MAJOR(devt)); } -static bool get_disk_and_module(struct gendisk *disk) -{ - struct module *owner; - - if (!disk->fops) - return false; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return false; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) { - module_put(owner); - return false; - } - return true; - -} - -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. 
- * - * Context: can sleep - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) -{ - struct gendisk *disk = NULL; - - might_sleep(); - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - if (!disk) { - mutex_unlock(&bdev_map_lock); - request_gendisk_module(devt); - mutex_lock(&bdev_map_lock); - disk = xa_load(&bdev_map, devt); - } - if (disk && !get_disk_and_module(disk)) - disk = NULL; - if (disk) - *partno = devt - disk_devt(disk); - mutex_unlock(&bdev_map_lock); - } else { - struct hd_struct *part; - - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); - } - spin_unlock_bh(&ext_devt_lock); - } - - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; -} - /** * bdget_disk - do bdget() by gendisk and partition number * @disk: gendisk of interest @@ -1559,11 +1419,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) * * This function releases all allocated resources of the gendisk. * - * The struct gendisk refcount is incremented with get_gendisk() or - * get_disk_and_module(), and its refcount is decremented with - * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this - * function is called. - * * Drivers which used __device_add_disk() have a gendisk with a request_queue * assigned. 
Since the request_queue sits on top of the gendisk for these * drivers we also call blk_put_queue() for them, and we expect the @@ -1748,16 +1603,17 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; + disk->part0.bdev = bdev_alloc(disk, 0); + if (!disk->part0.bdev) + goto out_free_disk; + disk->part0.dkstats = alloc_percpu(struct disk_stats); if (!disk->part0.dkstats) - goto out_free_disk; + goto out_bdput; - init_rwsem(&disk->lookup_sem); disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - goto out_free_disk; - } + if (disk_expand_part_tbl(disk, 0)) + goto out_free_bdstats; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); @@ -1773,7 +1629,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) */ hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_part0; + goto out_free_bdstats; disk->minors = minors; rand_initialize_disk(disk); @@ -1782,8 +1638,10 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_part0: - hd_free_part(&disk->part0); +out_free_bdstats: + free_percpu(disk->part0.dkstats); +out_bdput: + bdput(disk->part0.bdev); out_free_disk: kfree(disk); return NULL; @@ -1807,26 +1665,6 @@ void put_disk(struct gendisk *disk) } EXPORT_SYMBOL(put_disk); -/** - * put_disk_and_module - decrements the module and gendisk refcount - * @disk: the struct gendisk to decrement the refcount for - * - * This is a counterpart of get_disk_and_module() and thus also of - * get_gendisk(). - * - * Context: Any context, but the last reference must not be dropped from - * atomic context. 
- */ -void put_disk_and_module(struct gendisk *disk) -{ - if (disk) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - } -} - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; diff --git a/block/partitions/core.c b/block/partitions/core.c index a02e22411594..696bd9ff63c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -340,12 +340,11 @@ void delete_partition(struct hd_struct *part) device_del(part_to_dev(part)); /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. + * Remove the block device from the inode hash, so that it cannot be + * looked up any more even when openers still hold references. */ - blk_invalidate_devt(part_devt(part)); + remove_inode_hash(part->bdev->bd_inode); + percpu_ref_kill(&part->ref); } @@ -368,6 +367,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; + struct block_device *bdev; struct disk_part_tbl *ptbl; const char *dname; int err; @@ -402,11 +402,15 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); + err = -ENOMEM; p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; + if (!p->dkstats) goto out_free; - } + + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto out_free_stats; + p->bdev = bdev; hd_sects_seq_init(p); pdev = part_to_dev(p); @@ -420,10 +424,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, struct partition_meta_info *pinfo; pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } + if (!pinfo) + goto out_bdput; memcpy(pinfo, info, 
sizeof(*info)); p->info = pinfo; } @@ -470,6 +472,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ + bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk suppresses it */ @@ -479,6 +482,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_free_info: kfree(p->info); +out_bdput: + bdput(bdev); out_free_stats: free_percpu(p->dkstats); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d6e4d50834c..b350ed3af83b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -863,31 +863,46 @@ void __init bdev_cache_init(void) blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } -static struct block_device *bdget(dev_t dev) +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) { struct block_device *bdev; struct inode *inode; - inode = iget_locked(blockdev_superblock, dev); + inode = new_inode(blockdev_superblock); if (!inode) return NULL; + inode->i_mode = S_IFBLK; + inode->i_rdev = 0; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + + bdev = I_BDEV(inode); + spin_lock_init(&bdev->bd_size_lock); + bdev->bd_disk = disk; + bdev->bd_partno = partno; + bdev->bd_contains = NULL; + bdev->bd_super = NULL; + bdev->bd_inode = inode; + bdev->bd_part_count = 0; + return bdev; +} - bdev = &BDEV_I(inode)->bdev; +void bdev_add(struct block_device *bdev, dev_t dev) +{ + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; + insert_inode_hash(bdev->bd_inode); +} - if (inode->i_state & I_NEW) { - spin_lock_init(&bdev->bd_size_lock); - bdev->bd_contains = NULL; - bdev->bd_super = NULL; - bdev->bd_inode = inode; - bdev->bd_part_count = 0; - bdev->bd_dev = dev; - inode->i_mode = S_IFBLK; - inode->i_rdev = dev; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - unlock_new_inode(inode); - } - return bdev; +static 
struct block_device *bdget(dev_t dev) +{ + struct inode *inode; + + inode = ilookup(blockdev_superblock, dev); + if (!inode) + return NULL; + return &BDEV_I(inode)->bdev; } /** @@ -1004,27 +1019,6 @@ retry: } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ -static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) -{ - struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); - - if (!disk) - return NULL; - /* - * Now that we hold gendisk reference we make sure bdev we looked up is - * not stale. If it is, it means device got removed and created before - * we looked up gendisk and we fail open in such case. Associating - * unhashed bdev with newly created gendisk could lead to two bdevs - * (and thus two independent caches) being associated with one device - * which is bad. - */ - if (inode_unhashed(bdev->bd_inode)) { - put_disk_and_module(disk); - return NULL; - } - return disk; -} - static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); @@ -1347,19 +1341,17 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ -static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, - int partno, fmode_t mode) +static int __blkdev_get(struct block_device *bdev, fmode_t mode) { + struct gendisk *disk = bdev->bd_disk; int ret; if (!bdev->bd_openers) { - bdev->bd_disk = disk; bdev->bd_contains = bdev; - bdev->bd_partno = partno; - if (!partno) { + if (!bdev->bd_partno) { ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, 0); if (!bdev->bd_part) goto out_clear; @@ -1388,7 +1380,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, struct block_device *whole = bdget_disk(disk, 0); mutex_lock_nested(&whole->bd_mutex, 1); - ret = __blkdev_get(whole, disk, 0, mode); + ret = __blkdev_get(whole, mode); if (ret) { mutex_unlock(&whole->bd_mutex); 
bdput(whole); @@ -1398,7 +1390,7 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, mutex_unlock(&whole->bd_mutex); bdev->bd_contains = whole; - bdev->bd_part = disk_get_part(disk, partno); + bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { __blkdev_put(whole, mode, 1); @@ -1430,12 +1422,53 @@ static int __blkdev_get(struct block_device *bdev, struct gendisk *disk, out_clear: disk_put_part(bdev->bd_part); - bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_contains = NULL; return ret; } +struct block_device *blkdev_get_no_open(dev_t dev) +{ + struct block_device *bdev; + struct gendisk *disk; + + down_read(&bdev_lookup_sem); + bdev = bdget(dev); + if (!bdev) { + up_read(&bdev_lookup_sem); + blk_request_module(dev); + down_read(&bdev_lookup_sem); + + bdev = bdget(dev); + if (!bdev) + goto unlock; + } + + disk = bdev->bd_disk; + if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) + goto bdput; + if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + goto put_disk; + if (!try_module_get(bdev->bd_disk->fops->owner)) + goto put_disk; + up_read(&bdev_lookup_sem); + return bdev; +put_disk: + put_disk(disk); +bdput: + bdput(bdev); +unlock: + up_read(&bdev_lookup_sem); + return NULL; +} + +void blkdev_put_no_open(struct block_device *bdev) +{ + module_put(bdev->bd_disk->fops->owner); + put_disk(bdev->bd_disk); + bdput(bdev); +} + /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open @@ -1463,7 +1496,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) bool unblock_events = true; struct block_device *bdev; struct gendisk *disk; - int partno; int ret; ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, @@ -1473,18 +1505,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - bdev = bdget(dev); - if 
(!bdev) - return ERR_PTR(-ENOMEM); - /* * If we lost a race with 'disk' being deleted, try again. See md.c. */ retry: - ret = -ENXIO; - disk = bdev_get_gendisk(bdev, &partno); - if (!disk) - goto bdput; + bdev = blkdev_get_no_open(dev); + if (!bdev) + return ERR_PTR(-ENXIO); + disk = bdev->bd_disk; if (mode & FMODE_EXCL) { WARN_ON_ONCE(!holder); @@ -1492,7 +1520,7 @@ retry: ret = -ENOMEM; claiming = bdget_disk(disk, 0); if (!claiming) - goto put_disk; + goto put_blkdev; ret = bd_prepare_to_claim(bdev, claiming, holder); if (ret) goto put_claiming; @@ -1501,12 +1529,10 @@ retry: disk_block_events(disk); mutex_lock(&bdev->bd_mutex); - ret =__blkdev_get(bdev, disk, partno, mode); - if (!(mode & FMODE_EXCL)) { - ; /* nothing to do here */ - } else if (ret) { - bd_abort_claiming(bdev, claiming, holder); - } else { + ret =__blkdev_get(bdev, mode); + if (ret) + goto abort_claiming; + if (mode & FMODE_EXCL) { bd_finish_claiming(bdev, claiming, holder); /* @@ -1526,21 +1552,23 @@ retry: if (unblock_events) disk_unblock_events(disk); + if (mode & FMODE_EXCL) + bdput(claiming); + return bdev; +abort_claiming: + if (mode & FMODE_EXCL) + bd_abort_claiming(bdev, claiming, holder); + mutex_unlock(&bdev->bd_mutex); + disk_unblock_events(disk); put_claiming: if (mode & FMODE_EXCL) bdput(claiming); -put_disk: - if (ret) - put_disk_and_module(disk); +put_blkdev: + blkdev_put_no_open(bdev); if (ret == -ERESTARTSYS) goto retry; -bdput: - if (ret) { - bdput(bdev); - return ERR_PTR(ret); - } - return bdev; + return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); @@ -1641,7 +1669,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk_put_part(bdev->bd_part); bdev->bd_part = NULL; - bdev->bd_disk = NULL; if (bdev_is_partition(bdev)) victim = bdev->bd_contains; bdev->bd_contains = NULL; @@ -1699,12 +1726,10 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) * from userland - e.g. eject(1). 
*/ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); - bdput(bdev); - put_disk_and_module(disk); + blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index c8fc9792ac77..b9f3c246c3c9 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -197,12 +197,12 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); struct blkg_conf_ctx { - struct gendisk *disk; + struct block_device *bdev; struct blkcg_gq *blkg; char *body; }; -struct gendisk *blkcg_conf_get_disk(char **inputp); +struct block_device *blkcg_conf_open_bdev(char **inputp); int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx); void blkg_conf_finish(struct blkg_conf_ctx *ctx); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bdd7339bcda4..5d48b92f5e43 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1994,6 +1994,12 @@ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); +/* just for blk-cgroup, don't use elsewhere */ +struct block_device *blkdev_get_no_open(dev_t dev); +void blkdev_put_no_open(struct block_device *bdev); + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index ca5e356084c3..42a51653c730 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -65,6 +65,7 @@ struct hd_struct { struct disk_stats __percpu *dkstats; struct percpu_ref ref; + struct block_device 
*bdev; struct device __dev; struct kobject *holder_dir; int policy, partno; @@ -193,7 +194,6 @@ struct gendisk { int flags; unsigned long state; #define GD_NEED_PART_SCAN 0 - struct rw_semaphore lookup_sem; struct kobject *slave_dir; struct timer_rand_state *random; @@ -300,7 +300,6 @@ static inline void add_disk_no_queue_reg(struct gendisk *disk) } extern void del_gendisk(struct gendisk *gp); -extern struct gendisk *get_gendisk(dev_t dev, int *partno); extern struct block_device *bdget_disk(struct gendisk *disk, int partno); extern void set_disk_ro(struct gendisk *disk, int flag); @@ -338,7 +337,6 @@ int blk_drop_partitions(struct block_device *bdev); extern struct gendisk *__alloc_disk_node(int minors, int node_id); extern void put_disk(struct gendisk *disk); -extern void put_disk_and_module(struct gendisk *disk); #define alloc_disk_node(minors, node_id) \ ({ \ @@ -388,7 +386,10 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev, } #endif /* CONFIG_SYSFS */ +extern struct rw_semaphore bdev_lookup_sem; + dev_t blk_lookup_devt(const char *name, int partno); +void blk_request_module(dev_t devt); #ifdef CONFIG_BLOCK void printk_all_partitions(void); #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From a782483cc1f875355690625d8253a232f2581418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:43:37 +0100 Subject: block: remove the nr_sects field in struct hd_struct Now that the hd_struct always has a block device attached to it, there is no need for having two size fields that just get out of sync. Additionally the field in hd_struct did not use proper serialization, possibly allowing for torn writes. By only using the block_device field this problem also gets fixed. 
Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 2 +- block/blk.h | 53 --------------------------- block/genhd.c | 59 +++++++++++++++++------------- block/partitions/core.c | 17 +++++---- drivers/block/loop.c | 1 - drivers/block/nbd.c | 2 +- drivers/block/xen-blkback/common.h | 4 +-- drivers/md/bcache/super.c | 2 +- drivers/s390/block/dasd_ioctl.c | 4 +-- drivers/target/target_core_pscsi.c | 5 ++- fs/block_dev.c | 73 ++------------------------------------ fs/f2fs/super.c | 2 +- fs/pstore/blk.c | 2 +- include/linux/genhd.h | 29 ++++----------- kernel/trace/blktrace.c | 2 +- 16 files changed, 67 insertions(+), 194 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index fa01bef35bb1..669bb47a3198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -613,8 +613,8 @@ void guard_bio_eod(struct bio *bio) rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = part_nr_sects_read(part); - else + maxsector = bdev_nr_sectors(part->bdev); + else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..988f45094a38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -755,7 +755,7 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) + if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), diff --git a/block/blk.h b/block/blk.h index c4839abcfa27..09cee7024fb4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -387,59 +387,6 @@ static inline void hd_free_part(struct hd_struct *part) percpu_ref_exit(&part->ref); } -/* - * Any access of part->nr_sects which is not protected by 
partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. 
- */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/genhd.c b/block/genhd.c index bf8fa82f135f..c65f485b9db5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -40,6 +40,16 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + struct block_device *bdev = disk->part0.bdev; + + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} +EXPORT_SYMBOL(set_capacity); + /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. @@ -47,18 +57,30 @@ static void disk_release_events(struct gendisk *disk); bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - revalidate_disk_size(disk, true); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. 
+ */ + if (size == capacity || + (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + return false; - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - return true; - } + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, size, capacity); - return false; + /* + * Historically we did not send a uevent for changes to/from an empty + * device. + */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); @@ -247,7 +269,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part_nr_sects_read(part) && + if (!bdev_nr_sectors(part->bdev) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -284,7 +306,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); + sector < part->start_sect + bdev_nr_sectors(part->bdev); } /** @@ -986,8 +1008,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), + bdev_nr_sectors(part->bdev) >> 1, + disk_name(disk, part->partno, name_buf), part->info ? 
part->info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) @@ -1079,7 +1101,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, + bdev_nr_sectors(part->bdev) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1161,8 +1183,7 @@ ssize_t part_size_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); } ssize_t part_stat_show(struct device *dev, @@ -1618,16 +1639,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
- */ - hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) goto out_free_bdstats; diff --git a/block/partitions/core.c b/block/partitions/core.c index 696bd9ff63c6..bcfa8215bd5e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -85,6 +85,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -295,7 +302,7 @@ static void hd_struct_free_work(struct work_struct *work) put_device(disk_to_dev(disk)); part->start_sect = 0; - part->nr_sects = 0; + bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); } @@ -412,11 +419,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free_stats; p->bdev = bdev; - hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; - p->nr_sects = len; + bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -509,7 +515,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || + start >= part->start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->start_sect) continue; overlap = true; @@ -600,8 +606,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(bdevp, length); ret = 0; out_unlock: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 
d643c67be6ac..d2ce1ddc192d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1241,7 +1241,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { - bd_set_nr_sectors(bdev, 0); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 45b0423ef2c5..014683968ce1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1132,7 +1132,7 @@ static void nbd_bdev_reset(struct block_device *bdev) { if (bdev->bd_openers > 1) return; - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c6ea5d38c509..0762db247b41 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -358,9 +358,7 @@ struct pending_req { }; -#define vbd_sz(_v) ((_v)->bdev->bd_part ? 
\ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) +#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c55d3c58a7ef..04fa40868fbe 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1408,7 +1408,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, dc->bdev, &bcache_cached_ops); if (ret) return ret; diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 3359559517bf..304eba1acf16 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -54,8 +54,6 @@ dasd_ioctl_enable(struct block_device *bdev) return -ENODEV; dasd_enable_device(base); - /* Formatting the dasd device can change the capacity. */ - bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); dasd_put_device(base); return 0; } @@ -88,7 +86,7 @@ dasd_ioctl_disable(struct block_device *bdev) * Set i_size to zero, since read, write, etc. check against this * value. 
*/ - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); dasd_put_device(base); return 0; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 4e37fa9b409d..7994f27e4527 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1029,9 +1029,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) { struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); - if (pdv->pdv_bd && pdv->pdv_bd->bd_part) - return pdv->pdv_bd->bd_part->nr_sects; - + if (pdv->pdv_bd) + return bdev_nr_sectors(pdv->pdv_bd); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a5b6955a841f..31ee5a857f71 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1208,70 +1208,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -/** - * check_disk_size_change - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @bdev: struct bdev to adjust. - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -static void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev, bool verbose) -{ - loff_t disk_size, bdev_size; - - spin_lock(&bdev->bd_size_lock); - disk_size = (loff_t)get_capacity(disk) << 9; - bdev_size = i_size_read(bdev->bd_inode); - if (disk_size != bdev_size) { - if (verbose) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); - } - i_size_write(bdev->bd_inode, disk_size); - } - spin_unlock(&bdev->bd_size_lock); -} - -/** - * revalidate_disk_size - checks for disk size change and adjusts bdev size. 
- * @disk: struct gendisk to check - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -void revalidate_disk_size(struct gendisk *disk, bool verbose) -{ - struct block_device *bdev; - - /* - * Hidden disks don't have associated bdev so there's no point in - * revalidating them. - */ - if (disk->flags & GENHD_FL_HIDDEN) - return; - - bdev = bdget_disk(disk, 0); - if (bdev) { - check_disk_size_change(disk, bdev, verbose); - bdput(bdev); - } -} - -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) -{ - spin_lock(&bdev->bd_size_lock); - i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); - spin_unlock(&bdev->bd_size_lock); -} -EXPORT_SYMBOL(bd_set_nr_sectors); - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) @@ -1305,8 +1241,6 @@ rescan: disk->fops->revalidate_disk(disk); } - check_disk_size_change(disk, bdev, !invalidate); - if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) @@ -1349,10 +1283,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (disk->fops->open) ret = disk->fops->open(bdev, mode); - if (!ret) { - bd_set_nr_sectors(bdev, get_capacity(disk)); + if (!ret) set_init_blocksize(bdev); - } /* * If the device is invalidated, rescan partition @@ -1381,13 +1313,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev->bd_part->nr_sects) { + !bdev->bd_part || !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); ret = -ENXIO; goto out_clear; } - bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } diff --git 
a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f51807..d4e7fab352ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3151,7 +3151,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; int ret; diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..777a26f7bbe2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -245,7 +245,7 @@ static struct block_device *psblk_get_bdev(void *holder, return bdev; } - nr_sects = part_nr_sects_read(bdev->bd_part); + nr_sects = bdev_nr_sectors(bdev); if (!nr_sects) { pr_err("not enough space for '%s'\n", blkdev); blkdev_put(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6ba91ee54cb2..30d4785b7df8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,15 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - /* - * nr_sects is protected by sequence counter. One might extend a - * partition while IO is happening to it and update of nr_sects - * can be non-atomic on 32bit machines with 64bit sector_t. - */ - sector_t nr_sects; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_t nr_sects_seq; -#endif unsigned long stamp; struct disk_stats __percpu *dkstats; struct percpu_ref ref; @@ -254,13 +245,6 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } -static inline void hd_sects_seq_init(struct hd_struct *p) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_init(&p->nr_sects_seq); -#endif -} - /* * Smarter partition iterator without context limits. 
*/ @@ -318,13 +302,15 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_part->start_sect; } -static inline sector_t get_capacity(struct gendisk *disk) + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return disk->part0.nr_sects; + return i_size_read(bdev->bd_inode) >> 9; } -static inline void set_capacity(struct gendisk *disk, sector_t size) + +static inline sector_t get_capacity(struct gendisk *disk) { - disk->part0.nr_sects = size; + return bdev_nr_sectors(disk->part0.bdev); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); @@ -358,10 +344,9 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -void revalidate_disk_size(struct gendisk *disk, bool verbose); bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); +void set_capacity(struct gendisk *disk, sector_t size); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..7076d588a50d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -465,7 +465,7 @@ static void blk_trace_setup_lba(struct blk_trace *bt, if (part) { bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; + bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit v1.2.3 From 15e3d2c5cd53298272e59ad9072d3468f9dd3781 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:00 +0100 Subject: block: move disk stat accounting to struct block_device Move the dkstats and stamp fields to struct block_device in preparation of killing struct hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 4 ++-- block/blk.h | 1 - block/genhd.c | 14 ++++---------- block/partitions/core.c | 9 +-------- fs/block_dev.c | 10 ++++++++++ include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 2 -- include/linux/part_stat.h | 38 +++++++++++++++++++------------------- 9 files changed, 39 insertions(+), 43 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ad02289a4f7f..79aa96240cec 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk-core.c b/block/blk-core.c index 988f45094a38..d2c9cb24e087 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1264,9 +1264,9 @@ static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); + stamp = READ_ONCE(part->bdev->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } if (part->partno) { diff --git a/block/blk.h b/block/blk.h index 09cee7024fb4..3f801f6e86f8 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - free_percpu(part->dkstats); kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); diff --git a/block/genhd.c b/block/genhd.c index c65f485b9db5..2cbda8139556 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -112,7 +112,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -891,7 +891,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; + disk->part0.bdev->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1628,19 +1628,15 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk->part0.bdev) goto out_free_disk; - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) - goto out_bdput; - disk->node_id = node_id; if (disk_expand_part_tbl(disk, 0)) - goto out_free_bdstats; + goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); if (hd_ref_init(&disk->part0)) - goto out_free_bdstats; + goto out_bdput; disk->minors = minors; rand_initialize_disk(disk); @@ -1649,8 +1645,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) device_initialize(disk_to_dev(disk)); return disk; -out_free_bdstats: - free_percpu(disk->part0.dkstats); out_bdput: bdput(disk->part0.bdev); out_free_disk: diff 
--git a/block/partitions/core.c b/block/partitions/core.c index bcfa8215bd5e..8924e1ea8b2a 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -409,14 +409,9 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!p) return ERR_PTR(-EBUSY); - err = -ENOMEM; - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) - goto out_free; - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free_stats; + goto out_free; p->bdev = bdev; pdev = part_to_dev(p); @@ -490,8 +485,6 @@ out_free_info: kfree(p->info); out_bdput: bdput(bdev); -out_free_stats: - free_percpu(p->dkstats); out_free: kfree(p); return ERR_PTR(err); diff --git a/fs/block_dev.c b/fs/block_dev.c index 31ee5a857f71..0832c7830f3a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -781,6 +782,10 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) static void bdev_free_inode(struct inode *inode) { + struct block_device *bdev = I_BDEV(inode); + + free_percpu(bdev->bd_stats); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -875,6 +880,11 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_stats = alloc_percpu(struct disk_stats); + if (!bdev->bd_stats) { + iput(inode); + return NULL; + } return bdev; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2e0a9bd9688d..520011b95276 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,8 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + struct disk_stats __percpu *bd_stats; + unsigned long bd_stamp; dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d4785b7df8..804ac45fbfbc 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h 
@@ -52,8 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - unsigned long stamp; - struct disk_stats __percpu *dkstats; struct percpu_ref ref; struct block_device *bdev; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 24125778ef3e..87ad60106e1d 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,17 +25,17 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->dkstats, (cpu))->field) + (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->dkstats->field) res = 0; \ + typeof((part)->bdev->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->dkstats, _cpu)->field; \ + res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ res; \ }) @@ -44,7 +44,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) int i; for_each_possible_cpu(i) - memset(per_cpu_ptr(part->dkstats, i), value, + memset(per_cpu_ptr(part->bdev->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,7 +54,7 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->dkstats->field, addnd) + __this_cpu_add((part)->bdev->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ @@ -63,20 +63,20 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) field, addnd); \ } while (0) -#define part_stat_dec(gendiskp, field) \ - part_stat_add(gendiskp, field, -1) -#define part_stat_inc(gendiskp, field) \ - part_stat_add(gendiskp, field, 1) -#define part_stat_sub(gendiskp, field, subnd) \ - part_stat_add(gendiskp, field, -subnd) 
+#define part_stat_dec(part, field) \ + part_stat_add(part, field, -1) +#define part_stat_inc(part, field) \ + part_stat_add(part, field, 1) +#define part_stat_sub(part, field, subnd) \ + part_stat_add(part, field, -subnd) -#define part_stat_local_dec(gendiskp, field) \ - local_dec(&(part_stat_get(gendiskp, field))) -#define part_stat_local_inc(gendiskp, field) \ - local_inc(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read(gendiskp, field) \ - local_read(&(part_stat_get(gendiskp, field))) -#define part_stat_local_read_cpu(gendiskp, field, cpu) \ - local_read(&(part_stat_get_cpu(gendiskp, field, cpu))) +#define part_stat_local_dec(part, field) \ + local_dec(&(part_stat_get(part, field))) +#define part_stat_local_inc(part, field) \ + local_inc(&(part_stat_get(part, field))) +#define part_stat_local_read(part, field) \ + local_read(&(part_stat_get(part, field))) +#define part_stat_local_read_cpu(part, field, cpu) \ + local_read(&(part_stat_get_cpu(part, field, cpu))) #endif /* _LINUX_PART_STAT_H */ -- cgit v1.2.3 From 29ff57c61094e7bbd921ab10b5a99dce9a0132e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:24 +0100 Subject: block: move the start_sect field to struct block_device Move the start_sect field to struct block_device in preparation of killing struct hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-lib.c | 2 +- block/genhd.c | 4 ++-- block/partitions/core.c | 17 +++++++++-------- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 3 +-- kernel/trace/blktrace.c | 11 +++-------- 8 files changed, 22 insertions(+), 25 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d2c9cb24e087..9a3793d5ce38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -757,9 +757,10 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_sectors(bio)) { if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; - bio->bi_iter.bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->bdev->bd_start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector - + p->bdev->bd_start_sect); } bio->bi_partno = 0; ret = 0; diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d6a..752f9c722062 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -65,7 +65,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; diff --git a/block/genhd.c b/block/genhd.c index 2cbda8139556..5efb2df1f079 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -305,8 +305,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { - return part->start_sect <= sector && - sector < part->start_sect + bdev_nr_sectors(part->bdev); + return part->bdev->bd_start_sect <= sector && + sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); } /** diff --git a/block/partitions/core.c 
b/block/partitions/core.c index 8924e1ea8b2a..460a745812c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -192,7 +192,7 @@ static ssize_t part_start_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -219,7 +219,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -301,7 +301,7 @@ static void hd_struct_free_work(struct work_struct *work) */ put_device(disk_to_dev(disk)); - part->start_sect = 0; + part->bdev->bd_start_sect = 0; bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); @@ -416,7 +416,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); - p->start_sect = start; + bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -508,8 +508,9 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + bdev_nr_sectors(part->bdev) || - start + length <= part->start_sect) + start >= part->bdev->bd_start_sect + + bdev_nr_sectors(part->bdev) || + start + length <= part->bdev->bd_start_sect) continue; overlap = true; break; @@ -592,7 +593,7 
@@ int bdev_resize_partition(struct block_device *bdev, int partno, mutex_lock_nested(&bdev->bd_mutex, 1); ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bdev->bd_start_sect) goto out_unlock; ret = -EBUSY; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 520011b95276..a690008f60cd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; dev_t bd_dev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43a25d855e04..619adea57098 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1488,7 +1488,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.alignment_offset; } @@ -1529,7 +1529,7 @@ static inline int bdev_discard_alignment(struct block_device *bdev) if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 804ac45fbfbc..50d27f5d38e2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -51,7 +51,6 @@ struct partition_meta_info { }; struct hd_struct { - sector_t start_sect; struct percpu_ref ref; struct block_device *bdev; @@ -298,7 +297,7 @@ extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { - return bdev->bd_part->start_sect; + return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7076d588a50d..8a723a91ec5a 100644 --- 
a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -458,14 +458,9 @@ static struct rchan_callbacks blk_relay_callbacks = { static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { - struct hd_struct *part = NULL; - - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); + if (bdev) { + bt->start_lba = bdev->bd_start_sect; + bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit v1.2.3 From 231926dbf0f084211e4ec4f4c006f0bf1f47809a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 12:01:45 +0100 Subject: block: move the partition_meta_info to struct block_device Move the partition_meta_info to struct block_device in preparation for killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 1 - block/genhd.c | 3 ++- block/partitions/core.c | 18 +++++++----------- fs/block_dev.c | 1 + include/linux/blk_types.h | 2 ++ include/linux/genhd.h | 1 - init/do_mounts.c | 7 ++++--- 7 files changed, 16 insertions(+), 17 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index 3f801f6e86f8..0bd4b58bcbaf 100644 --- a/block/blk.h +++ b/block/blk.h @@ -381,7 +381,6 @@ static inline void hd_struct_put(struct hd_struct *part) static inline void hd_free_part(struct hd_struct *part) { - kfree(part->info); bdput(part->bdev); percpu_ref_exit(&part->ref); } diff --git a/block/genhd.c b/block/genhd.c index 5efb2df1f079..4273e89f07e8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1010,7 +1010,8 @@ void __init printk_all_partitions(void) bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, disk_name(disk, part->partno, name_buf), - part->info ? part->info->uuid : ""); + part->bdev->bd_meta_info ? 
+ part->bdev->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", diff --git a/block/partitions/core.c b/block/partitions/core.c index 460a745812c6..07df9ff55462 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -275,8 +275,9 @@ static int part_uevent(struct device *dev, struct kobj_uevent_env *env) struct hd_struct *part = dev_to_part(dev); add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", + part->bdev->bd_meta_info->volname); return 0; } @@ -422,13 +423,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, p->policy = get_disk_ro(disk); if (info) { - struct partition_meta_info *pinfo; - - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) goto out_bdput; - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; } dname = dev_name(ddev); @@ -444,7 +442,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, err = blk_alloc_devt(p, &devt); if (err) - goto out_free_info; + goto out_bdput; pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ @@ -481,8 +479,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, kobject_uevent(&pdev->kobj, KOBJ_ADD); return p; -out_free_info: - kfree(p->info); out_bdput: bdput(bdev); out_free: diff --git a/fs/block_dev.c b/fs/block_dev.c index 0832c7830f3a..0770f654b09c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -785,6 +785,7 @@ static void bdev_free_inode(struct inode *inode) struct block_device *bdev = I_BDEV(inode); free_percpu(bdev->bd_stats); + kfree(bdev->bd_meta_info); kmem_cache_free(bdev_cachep, BDEV_I(inode)); } diff 
--git a/include/linux/blk_types.h b/include/linux/blk_types.h index a690008f60cd..2f8ede04e5a9 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -49,6 +49,8 @@ struct block_device { /* Mutex for freeze */ struct mutex bd_fsfreeze_mutex; struct super_block *bd_fsfreeze_sb; + + struct partition_meta_info *bd_meta_info; } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 50d27f5d38e2..30d7076155b4 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -57,7 +57,6 @@ struct hd_struct { struct device __dev; struct kobject *holder_dir; int policy, partno; - struct partition_meta_info *info; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; #endif diff --git a/init/do_mounts.c b/init/do_mounts.c index 5879edf083b3..368ccb718501 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -79,8 +79,8 @@ static int match_dev_by_uuid(struct device *dev, const void *data) const struct uuidcmp *cmp = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || - strncasecmp(cmp->uuid, part->info->uuid, cmp->len)) + if (!part->bdev->bd_meta_info || + strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -169,7 +169,8 @@ static int match_dev_by_label(struct device *dev, const void *data) const char *label = data; struct hd_struct *part = dev_to_part(dev); - if (!part->info || strcmp(label, part->info->volname)) + if (!part->bdev->bd_meta_info || + strcmp(label, part->bdev->bd_meta_info->volname)) return 0; return 1; } -- cgit v1.2.3 From 1bdd5ae0251d678488dffcf455d4633c2beef1bc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 19:00:13 +0100 Subject: block: move holder_dir to struct block_device Move the holder_dir field to struct block_device in preparation for killing struct hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 5 +++-- block/partitions/core.c | 8 ++++---- fs/block_dev.c | 11 +++++------ include/linux/blk_types.h | 1 + include/linux/genhd.h | 1 - 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 4273e89f07e8..0bd7026cee62 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -681,7 +681,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->part0.bdev->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -887,7 +888,7 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0.bdev->bd_holder_dir); kobject_put(disk->slave_dir); part_stat_set_all(&disk->part0, 0); diff --git a/block/partitions/core.c b/block/partitions/core.c index 07df9ff55462..c068471fa654 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -344,7 +344,7 @@ void delete_partition(struct hd_struct *part) */ get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); /* @@ -452,8 +452,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -487,7 +487,7 @@ out_free: out_remove_file: device_remove_file(pdev, &dev_attr_whole_disk); out_del: - 
kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); diff --git a/fs/block_dev.c b/fs/block_dev.c index 0770f654b09c..381c22426f43 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1142,7 +1142,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) + if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); @@ -1165,14 +1165,14 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) if (ret) goto out_free; - ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. */ - kobject_get(bdev->bd_part->holder_dir); + kobject_get(bdev->bd_holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; @@ -1207,9 +1207,8 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_part->holder_dir, - &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_part->holder_dir); + del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(bdev->bd_holder_dir); list_del_init(&holder->list); kfree(holder); } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 2f8ede04e5a9..c0591e52d7d7 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -35,6 +35,7 @@ struct block_device { #ifdef CONFIG_SYSFS struct list_head bd_holder_disks; #endif + struct kobject *bd_holder_dir; u8 bd_partno; struct hd_struct * bd_part; /* number of times partitions within this 
device have been opened. */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 30d7076155b4..b4a5c05593b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - struct kobject *holder_dir; int policy, partno; #ifdef CONFIG_FAIL_MAKE_REQUEST int make_it_fail; -- cgit v1.2.3 From b309e9936347232c724eaa13f70533128b4864e9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:28:47 +0100 Subject: block: move make_it_fail to struct block_device Move the make_it_fail flag to struct block_device and turn it into a bool in preparation of killing struct hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 3 ++- block/genhd.c | 4 ++-- include/linux/blk_types.h | 3 +++ include/linux/genhd.h | 3 --- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9a3793d5ce38..9121390be97a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -668,7 +668,8 @@ __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return part->bdev->bd_make_it_fail && + should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) diff --git a/block/genhd.c b/block/genhd.c index 0bd7026cee62..f9c957739d4b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1292,7 +1292,7 @@ ssize_t part_fail_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, @@ -1303,7 +1303,7 @@ ssize_t part_fail_store(struct device *dev, int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = 
(i == 0) ? 0 : 1; + p->bdev->bd_make_it_fail = i; return count; } diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c0591e52d7d7..b237f1e40814 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -52,6 +52,9 @@ struct block_device { struct super_block *bd_fsfreeze_sb; struct partition_meta_info *bd_meta_info; +#ifdef CONFIG_FAIL_MAKE_REQUEST + bool bd_make_it_fail; +#endif } __randomize_layout; #define bdev_whole(_bdev) \ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index b4a5c05593b9..349cf6403ccd 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -56,9 +56,6 @@ struct hd_struct { struct block_device *bdev; struct device __dev; int policy, partno; -#ifdef CONFIG_FAIL_MAKE_REQUEST - int make_it_fail; -#endif struct rcu_work rcu_work; }; -- cgit v1.2.3 From 83950d359010a493462d58c712b1124c877d1b3b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 23 Nov 2020 16:36:02 +0100 Subject: block: move the policy field to struct block_device Move the policy field to struct block_device and rename it to the more descriptive bd_read_only. Also turn the field into a bool as it is used as such. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/genhd.c | 8 ++++---- block/ioctl.c | 2 +- block/partitions/core.c | 4 ++-- include/linux/blk_types.h | 1 + include/linux/genhd.h | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index 9121390be97a..d64ffcb6f9ae 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -696,7 +696,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); - if (part->policy && op_is_write(op)) { + if (part->bdev->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) diff --git a/block/genhd.c b/block/genhd.c index f9c957739d4b..2db1204920a9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1687,14 +1687,14 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.policy != flag) { + if (disk->part0.bdev->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; + disk->part0.bdev->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->policy = flag; + part->bdev->bd_read_only = flag; disk_part_iter_exit(&piter); } @@ -1704,7 +1704,7 @@ int bdev_read_only(struct block_device *bdev) { if (!bdev) return 0; - return bdev->bd_part->policy; + return bdev->bd_read_only; } EXPORT_SYMBOL(bdev_read_only); diff --git a/block/ioctl.c b/block/ioctl.c index a6d8171221c7..d61d652078f4 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -345,7 +345,7 @@ static int blkdev_roset(struct block_device *bdev, fmode_t mode, if (ret) return ret; } - bdev->bd_part->policy = n; + bdev->bd_read_only = n; return 0; } diff --git a/block/partitions/core.c b/block/partitions/core.c index c068471fa654..060c1be13cd8 100644 --- 
a/block/partitions/core.c +++ b/block/partitions/core.c @@ -199,7 +199,7 @@ static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); + return sprintf(buf, "%d\n", p->bdev->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, @@ -420,7 +420,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; - p->policy = get_disk_ro(disk); + bdev->bd_read_only = get_disk_ro(disk); if (info) { err = -ENOMEM; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index b237f1e40814..758cf71c9aa2 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -23,6 +23,7 @@ struct block_device { sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; + bool bd_read_only; /* read-only policy */ dev_t bd_dev; int bd_openers; struct inode * bd_inode; /* will die */ diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 349cf6403ccd..dcbf9ef7610e 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -55,7 +55,7 @@ struct hd_struct { struct block_device *bdev; struct device __dev; - int policy, partno; + int partno; struct rcu_work rcu_work; }; @@ -278,7 +278,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.policy; + return disk->part0.bdev->bd_read_only; } extern void disk_block_events(struct gendisk *disk); -- cgit v1.2.3 From cb8432d650fe3be58bb962bc8e602dc405510327 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:47:17 +0100 Subject: block: allocate struct hd_struct as part of struct bdev_inode Allocate hd_struct together with struct block_device to pre-load the lifetime rule changes in preparation of merging the two structures. 
Note that part0 was previously embedded into struct gendisk, but is a separate allocation now, and already points to the block_device instead of the hd_struct. The lifetime of struct gendisk is still controlled by the struct device embedded in the part0 hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++----- block/blk-flush.c | 2 +- block/blk-merge.c | 2 -- block/blk.h | 21 ------------ block/genhd.c | 50 +++++++++++----------------- block/partitions/core.c | 67 ++++---------------------------------- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 3 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/dm.c | 4 +-- drivers/md/md.c | 2 +- fs/block_dev.c | 39 +++++++--------------- include/linux/blk_types.h | 2 +- include/linux/genhd.h | 14 ++++---- include/linux/part_stat.h | 4 +-- 15 files changed, 61 insertions(+), 169 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index d64ffcb6f9ae..9ea70275fc1c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -714,7 +714,8 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0->bd_part, + bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -831,7 +832,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1203,7 +1204,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, 
blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1272,7 +1273,7 @@ again: __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->partno) { - part = &part_to_disk(part)->part0; + part = part_to_disk(part)->part0->bd_part; goto again; } } @@ -1309,8 +1310,6 @@ void blk_account_io_done(struct request *req, u64 now) part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); - - hd_struct_put(part); } } @@ -1354,7 +1353,7 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(disk->part0->bd_part, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); @@ -1376,14 +1375,13 @@ void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); - hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(disk->part0->bd_part, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index e32958f0b687..fcd0a60574df 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct hd_struct *part = rq->rq_disk->part0->bd_part; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-merge.c b/block/blk-merge.c index bcf5e4580603..cb351ab9b77d 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -683,8 +683,6 @@ static void 
blk_account_io_merge_request(struct request *req) part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); part_stat_unlock(); - - hd_struct_put(req->part); } } diff --git a/block/blk.h b/block/blk.h index 0bd4b58bcbaf..32ac41f7557f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -363,27 +363,6 @@ int bdev_del_partition(struct block_device *bdev, int partno); int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); - -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} - -static inline void hd_struct_put(struct hd_struct *part) -{ - if (part->partno) - percpu_ref_put(&part->ref); -} - -static inline void hd_free_part(struct hd_struct *part) -{ - bdput(part->bdev); - percpu_ref_exit(&part->ref); -} int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, diff --git a/block/genhd.c b/block/genhd.c index 2db1204920a9..c35b03dac5e5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -42,7 +42,7 @@ static void disk_release_events(struct gendisk *disk); void set_capacity(struct gendisk *disk, sector_t sectors) { - struct block_device *bdev = disk->part0.bdev; + struct block_device *bdev = disk->part0; spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); @@ -318,9 +318,7 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * primarily used for stats accounting. * * CONTEXT: - * RCU read locked. The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. + * RCU read locked. 
* * RETURNS: * Found partition on success, part0 is returned if no partition matches @@ -336,26 +334,19 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) ptbl = rcu_dereference(disk->part_tbl); part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + if (part && sector_in_part(part, sector)) goto out_unlock; for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; } } - part = &disk->part0; + part = disk->part0->bd_part; out_unlock: rcu_read_unlock(); return part; @@ -681,8 +672,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.bdev->bd_holder_dir = - kobject_create_and_add("holders", &ddev->kobj); + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); if (disk->flags & GENHD_FL_HIDDEN) { @@ -748,7 +739,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(&disk->part0, &devt); + retval = blk_alloc_devt(disk->part0->bd_part, &devt); if (retval) { WARN_ON(1); return; @@ -775,7 +766,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); WARN_ON(ret); bdi_set_owner(bdi, dev); - bdev_add(disk->part0.bdev, devt); + bdev_add(disk->part0, devt); } register_disk(parent, disk, groups); if (register_queue) @@ -888,11 +879,11 @@ void del_gendisk(struct gendisk *disk) blk_unregister_queue(disk); - kobject_put(disk->part0.bdev->bd_holder_dir); + 
kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(&disk->part0, 0); - disk->part0.bdev->bd_stamp = 0; + part_stat_set_all(disk->part0->bd_part, 0); + disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); @@ -1005,7 +996,7 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; + bool is_part0 = part == disk->part0->bd_part; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), @@ -1460,7 +1451,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); + bdput(disk->part0); if (disk->queue) blk_put_queue(disk->queue); kfree(disk); @@ -1626,8 +1617,8 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) if (!disk) return NULL; - disk->part0.bdev = bdev_alloc(disk, 0); - if (!disk->part0.bdev) + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) goto out_free_disk; disk->node_id = node_id; @@ -1635,10 +1626,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - if (hd_ref_init(&disk->part0)) - goto out_bdput; + rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); disk->minors = minors; rand_initialize_disk(disk); @@ -1648,7 +1636,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) return disk; out_bdput: - bdput(disk->part0.bdev); + bdput(disk->part0); out_free_disk: kfree(disk); return NULL; @@ -1687,9 +1675,9 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; - if (disk->part0.bdev->bd_read_only != flag) { + if (disk->part0->bd_read_only 
!= flag) { set_disk_ro_uevent(disk, flag); - disk->part0.bdev->bd_read_only = flag; + disk->part0->bd_read_only = flag; } disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); diff --git a/block/partitions/core.c b/block/partitions/core.c index 060c1be13cd8..6d1fca193cbd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -265,9 +265,9 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + bdput(p->bdev); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) @@ -288,46 +288,6 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) -{ - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - struct gendisk *disk = part_to_disk(part); - - /* - * Release the disk reference acquired in delete_partition here. - * We can't release it in hd_struct_free because the final put_device - * needs process context and thus can't be run directly from a - * percpu_ref ->release handler. 
- */ - put_device(disk_to_dev(disk)); - - part->bdev->bd_start_sect = 0; - bdev_set_nr_sectors(part->bdev, 0); - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - /* * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. @@ -342,8 +302,8 @@ void delete_partition(struct hd_struct *part) * ->part_tbl is referenced in this part's release handler, so * we have to hold the disk device */ - get_device(disk_to_dev(disk)); rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -353,7 +313,7 @@ void delete_partition(struct hd_struct *part) */ remove_inode_hash(part->bdev->bd_inode); - percpu_ref_kill(&part->ref); + put_device(part_to_dev(part)); } static ssize_t whole_disk_show(struct device *dev, @@ -406,15 +366,11 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (ptbl->part[partno]) return ERR_PTR(-EBUSY); - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return ERR_PTR(-EBUSY); - bdev = bdev_alloc(disk, partno); if (!bdev) - goto out_free; - p->bdev = bdev; + return ERR_PTR(-ENOMEM); + p = bdev->bd_part; pdev = part_to_dev(p); bdev->bd_start_sect = start; @@ -463,13 +419,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & 
ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ bdev_add(bdev, devt); rcu_assign_pointer(ptbl->part[partno], p); @@ -481,11 +430,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, out_bdput: bdput(bdev); -out_free: - kfree(p); return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); out_del: kobject_put(bdev->bd_holder_dir); device_del(pdev); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index dc333dbe5232..9e5c2fdfda36 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index ba56f3f05312..343f56b86bb7 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1678,7 +1678,8 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; - device->rs_last_events = (int)part_stat_read_accum(&disk->part0, sectors); + device->rs_last_events = + (int)part_stat_read_accum(disk->part0->bd_part, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b5f68951c9d2..6d84876a9cd0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(&zram->disk->part0, 0); + part_stat_set_all(zram->disk->part0->bd_part, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 48051db006f3..1b2db4d530ea 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(&dm_disk(md)->part0, + __dm_part_stat_sub(dm_disk(md)->part0->bd_part, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = &dm_disk(md)->part0; + struct hd_struct *part = dm_disk(md)->part0->bd_part; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 7ce6047c856e..3696c2d77a4d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(&disk->part0, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/fs/block_dev.c b/fs/block_dev.c index 381c22426f43..61cf33b6284f 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,6 +39,7 @@ struct bdev_inode { struct 
block_device bdev; + struct hd_struct hd; struct inode vfs_inode; }; @@ -886,6 +887,9 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } + bdev->bd_part = &BDEV_I(inode)->hd; + memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); + bdev->bd_part->bdev = bdev; return bdev; } @@ -1280,15 +1284,10 @@ EXPORT_SYMBOL_GPL(bdev_disk_changed); static int __blkdev_get(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { - ret = -ENXIO; - bdev->bd_part = disk_get_part(disk, 0); - if (!bdev->bd_part) - goto out_clear; - ret = 0; if (disk->fops->open) ret = disk->fops->open(bdev, mode); @@ -1307,7 +1306,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) - goto out_clear; + return ret; } else { struct block_device *whole = bdget_disk(disk, 0); @@ -1316,18 +1315,16 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (ret) { mutex_unlock(&whole->bd_mutex); bdput(whole); - goto out_clear; + return ret; } whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev_nr_sectors(bdev)) { + !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); - ret = -ENXIO; - goto out_clear; + return -ENXIO; } set_init_blocksize(bdev); } @@ -1336,7 +1333,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (!bdev_is_partition(bdev)) { - ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ @@ -1349,11 +1345,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) } bdev->bd_openers++; return 0; - - out_clear: - disk_put_part(bdev->bd_part); - 
bdev->bd_part = NULL; - return ret; } struct block_device *blkdev_get_no_open(dev_t dev) @@ -1580,18 +1571,12 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); - - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); - - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; if (bdev_is_partition(bdev)) victim = bdev_whole(bdev); - } else { - if (!bdev_is_partition(bdev) && disk->fops->release) - disk->fops->release(disk, mode); } + + if (!bdev_is_partition(bdev) && disk->fops->release) + disk->fops->release(disk, mode); mutex_unlock(&bdev->bd_mutex); if (victim) { __blkdev_put(victim, mode, 1); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 758cf71c9aa2..6edea5c16259 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -59,7 +59,7 @@ struct block_device { } __randomize_layout; #define bdev_whole(_bdev) \ - ((_bdev)->bd_disk->part0.bdev) + ((_bdev)->bd_disk->part0) #define bdev_kobj(_bdev) \ (&part_to_dev((_bdev)->bd_part)->kobj) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index dcbf9ef7610e..df7319da013c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,11 +19,12 @@ #include #include -#define dev_to_disk(device) container_of((device), struct gendisk, part0.__dev) #define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define disk_to_dev(disk) (&(disk)->part0.__dev) #define part_to_dev(part) (&((part)->__dev)) +#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) +#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) + extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,12 +52,9 @@ struct partition_meta_info { }; struct hd_struct { - struct percpu_ref ref; - struct block_device *bdev; struct device __dev; int partno; - struct rcu_work rcu_work; }; 
/** @@ -168,7 +166,7 @@ struct gendisk { * helpers. */ struct disk_part_tbl __rcu *part_tbl; - struct hd_struct part0; + struct block_device *part0; const struct block_device_operations *fops; struct request_queue *queue; @@ -278,7 +276,7 @@ extern void set_disk_ro(struct gendisk *disk, int flag); static inline int get_disk_ro(struct gendisk *disk) { - return disk->part0.bdev->bd_read_only; + return disk->part0->bd_read_only; } extern void disk_block_events(struct gendisk *disk); @@ -302,7 +300,7 @@ static inline sector_t bdev_nr_sectors(struct block_device *bdev) static inline sector_t get_capacity(struct gendisk *disk) { - return bdev_nr_sectors(disk->part0.bdev); + return bdev_nr_sectors(disk->part0); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 87ad60106e1d..680de036691e 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -59,8 +59,8 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ if ((part)->partno) \ - __part_stat_add(&part_to_disk((part))->part0, \ - field, addnd); \ + __part_stat_add(part_to_disk((part))->part0->bd_part, \ + field, addnd); \ } while (0) #define part_stat_dec(part, field) \ -- cgit v1.2.3 From 8446fe9255be821cb38ffd306d7e8edc4b9ea662 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:36:54 +0100 Subject: block: switch partition lookup to use struct block_device Use struct block_device to lookup partitions on a disk. This removes all usage of struct hd_struct from the I/O path. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 66 +++++++++++++++++--------------------- block/blk-flush.c | 2 +- block/blk-mq.c | 9 +++--- block/blk-mq.h | 7 ++-- block/blk.h | 4 +-- block/genhd.c | 57 +++++++++++++++++--------------- block/partitions/core.c | 7 ++-- drivers/block/drbd/drbd_receiver.c | 2 +- drivers/block/drbd/drbd_worker.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/md/bcache/request.c | 4 +-- drivers/md/dm.c | 4 +-- drivers/md/md.c | 4 +-- drivers/nvme/target/admin-cmd.c | 20 ++++++------ fs/ext4/super.c | 18 ++++------- fs/ext4/sysfs.c | 10 ++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/super.c | 6 ++-- include/linux/blkdev.h | 8 ++--- include/linux/genhd.h | 4 +-- include/linux/part_stat.h | 17 +++++----- 22 files changed, 122 insertions(+), 137 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index 669bb47a3198..ebb18136b86f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -608,12 +608,12 @@ void bio_truncate(struct bio *bio, unsigned new_size) void guard_bio_eod(struct bio *bio) { sector_t maxsector; - struct hd_struct *part; + struct block_device *part; rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = bdev_nr_sectors(part->bdev); + maxsector = bdev_nr_sectors(part); else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 9ea70275fc1c..cee568389b7e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -666,10 +666,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +static bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->bdev->bd_make_it_fail && - should_fail(&fail_make_request, 
bytes); + return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) @@ -684,7 +683,7 @@ late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool should_fail_request(struct hd_struct *part, +static inline bool should_fail_request(struct block_device *part, unsigned int bytes) { return false; @@ -692,11 +691,11 @@ static inline bool should_fail_request(struct hd_struct *part, #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline bool bio_check_ro(struct bio *bio, struct block_device *part) { const int op = bio_op(bio); - if (part->bdev->bd_read_only && op_is_write(op)) { + if (part->bd_read_only && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) @@ -704,7 +703,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); + bio_devname(bio, b), part->bd_partno); /* Older lvm-tools actually trigger this */ return false; } @@ -714,8 +713,7 @@ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0->bd_part, - bio->bi_iter.bi_size)) + if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -744,7 +742,7 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct bio *bio) { - struct hd_struct *p; + struct block_device *p; int ret = -EIO; rcu_read_lock(); @@ -757,12 +755,12 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) + if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; - bio->bi_iter.bi_sector += 
p->bdev->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, bio->bi_iter.bi_sector - - p->bdev->bd_start_sect); + p->bd_start_sect); } bio->bi_partno = 0; ret = 0; @@ -832,7 +830,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0->bd_part))) + if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; @@ -1204,7 +1202,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return ret; if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0->bd_part, blk_rq_bytes(rq))) + should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -1263,17 +1261,18 @@ unsigned int blk_rq_err_bytes(const struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +static void update_io_ticks(struct block_device *part, unsigned long now, + bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->bdev->bd_stamp); + stamp = READ_ONCE(part->bd_stamp); if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->bdev->bd_stamp, stamp, now) == stamp)) + if (likely(cmpxchg(&part->bd_stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } - if (part->partno) { - part = part_to_disk(part)->part0->bd_part; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } @@ -1282,11 +1281,9 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -1301,14 +1298,11 @@ void blk_account_io_done(struct request *req, u64 now) if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); } } @@ -1325,7 +1319,7 @@ void blk_account_io_start(struct request *rq) part_stat_unlock(); } -static unsigned long __part_start_io_acct(struct hd_struct *part, +static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); @@ -1341,7 +1335,7 @@ static unsigned long __part_start_io_acct(struct hd_struct *part, return now; } -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, +unsigned long part_start_io_acct(struct gendisk *disk, struct block_device **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); @@ -1353,11 +1347,11 @@ EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(disk->part0->bd_part, sectors, op); + return 
__part_start_io_acct(disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); -static void __part_end_io_acct(struct hd_struct *part, unsigned int op, +static void __part_end_io_acct(struct block_device *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); @@ -1371,7 +1365,7 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, part_stat_unlock(); } -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); @@ -1381,7 +1375,7 @@ EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(disk->part0->bd_part, op, start_time); + __part_end_io_acct(disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); diff --git a/block/blk-flush.c b/block/blk-flush.c index fcd0a60574df..9507dcdd5881 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -139,7 +139,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = rq->rq_disk->part0->bd_part; + struct block_device *part = rq->rq_disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..a2593748fa53 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -95,7 +95,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; @@ -111,7 +111,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -120,8 
+121,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; diff --git a/block/blk-mq.h b/block/blk-mq.h index a52703c98b77..c696515766c7 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -182,9 +182,10 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); static inline void blk_mq_put_dispatch_budget(struct request_queue *q) { diff --git a/block/blk.h b/block/blk.h index 32ac41f7557f..d5bf8f3a0781 100644 --- a/block/blk.h +++ b/block/blk.h @@ -215,7 +215,7 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); +struct block_device *__disk_get_part(struct gendisk *disk, int partno); ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); @@ -348,7 +348,7 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q); static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} #endif -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); int blk_alloc_devt(struct hd_struct *part, dev_t *devt); void blk_free_devt(dev_t devt); diff --git a/block/genhd.c b/block/genhd.c index 
c35b03dac5e5..ed06466b305d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -126,7 +126,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -141,7 +141,8 @@ static unsigned int part_in_flight(struct hd_struct *part) return inflight; } -static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +static void part_in_flight_rw(struct block_device *part, + unsigned int inflight[2]) { int cpu; @@ -157,7 +158,7 @@ static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); @@ -182,15 +183,21 @@ struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) */ struct hd_struct *disk_get_part(struct gendisk *disk, int partno) { + struct block_device *bdev; struct hd_struct *part; rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); + bdev = __disk_get_part(disk, partno); + if (!bdev) + goto fail; + part = bdev->bd_part; + if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) + goto fail; rcu_read_unlock(); - return part; +fail: + rcu_read_unlock(); + return NULL; } /** @@ -264,19 +271,19 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) /* iterate to the next partition */ for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; + struct block_device *part; part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!bdev_nr_sectors(part->bdev) && + if (!bdev_nr_sectors(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) continue; - 
get_device(part_to_dev(part)); - piter->part = part; + get_device(part_to_dev(part->bd_part)); + piter->part = part->bd_part; piter->idx += inc; break; } @@ -303,10 +310,10 @@ void disk_part_iter_exit(struct disk_part_iter *piter) } EXPORT_SYMBOL_GPL(disk_part_iter_exit); -static inline int sector_in_part(struct hd_struct *part, sector_t sector) +static inline int sector_in_part(struct block_device *part, sector_t sector) { - return part->bdev->bd_start_sect <= sector && - sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); + return part->bd_start_sect <= sector && + sector < part->bd_start_sect + bdev_nr_sectors(part); } /** @@ -324,10 +331,10 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector) * Found partition on success, part0 is returned if no partition matches * or the matched partition is being deleted. */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) { struct disk_part_tbl *ptbl; - struct hd_struct *part; + struct block_device *part; int i; rcu_read_lock(); @@ -346,7 +353,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) } } - part = disk->part0->bd_part; + part = disk->part0; out_unlock: rcu_read_unlock(); return part; @@ -882,7 +889,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); - part_stat_set_all(disk->part0->bd_part, 0); + part_stat_set_all(disk->part0, 0); disk->part0->bd_stamp = 0; if (!sysfs_deprecated) sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); @@ -1189,9 +1196,9 @@ ssize_t part_stat_show(struct device *dev, part_stat_read_all(p, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, p->bdev); else - inflight = part_in_flight(p); + inflight = part_in_flight(p->bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1231,9 +1238,9 @@ ssize_t 
part_inflight_show(struct device *dev, struct device_attribute *attr, unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p, inflight); + blk_mq_in_flight_rw(q, p->bdev, inflight); else - part_in_flight_rw(p, inflight); + part_in_flight_rw(p->bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1506,9 +1513,9 @@ static int diskstats_show(struct seq_file *seqf, void *v) while ((hd = disk_part_iter_next(&piter))) { part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); + inflight = blk_mq_in_flight(gp->queue, hd->bdev); else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd->bdev); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1626,7 +1633,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) goto out_bdput; ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], disk->part0->bd_part); + rcu_assign_pointer(ptbl->part[0], disk->part0); disk->minors = minors; rand_initialize_disk(disk); diff --git a/block/partitions/core.c b/block/partitions/core.c index 6d1fca193cbd..c2f6721633b8 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -298,12 +298,9 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ rcu_assign_pointer(ptbl->part[part->partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); + kobject_put(part->bdev->bd_holder_dir); device_del(part_to_dev(part)); @@ -421,7 +418,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ bdev_add(bdev, devt); - rcu_assign_pointer(ptbl->part[partno], p); + rcu_assign_pointer(ptbl->part[partno], bdev); /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) diff --git 
a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9e5c2fdfda36..09c86ef3f0fd 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2802,7 +2802,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) if (c_min_rate == 0) return false; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 343f56b86bb7..02044ab7f767 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1679,7 +1679,7 @@ void drbd_rs_controller_reset(struct drbd_device *device) atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; device->rs_last_events = - (int)part_stat_read_accum(disk->part0->bd_part, sectors); + (int)part_stat_read_accum(disk->part0, sectors); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 6d84876a9cd0..dc8957d173d3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1687,7 +1687,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; set_capacity_and_notify(zram->disk, 0); - part_stat_set_all(zram->disk->part0->bd_part, 0); + part_stat_set_all(zram->disk->part0, 0); up_write(&zram->init_lock); /* I/O operation under all of CPU are done so let's free */ diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index afac8d07c1bd..85b1f2a9b72d 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -475,7 +475,7 @@ struct search { unsigned int read_dirty_data:1; unsigned int cache_missed:1; - struct hd_struct *part; + struct block_device *part; unsigned long start_time; struct btree_op op; @@ -1073,7 +1073,7 @@ struct detached_dev_io_private { unsigned long start_time; bio_end_io_t *bi_end_io; void *bi_private; - struct hd_struct *part; + struct block_device *part; }; static void detached_dev_end_io(struct bio *bio) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 1b2db4d530ea..176adcff56b3 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1607,7 +1607,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * (by eliminating DM's splitting and just using bio_split) */ part_stat_lock(); - __dm_part_stat_sub(dm_disk(md)->part0->bd_part, + __dm_part_stat_sub(dm_disk(md)->part0, sectors[op_stat_group(bio_op(bio))], ci.sector_count); part_stat_unlock(); @@ -2242,7 +2242,7 @@ EXPORT_SYMBOL_GPL(dm_put); static bool md_in_flight_bios(struct mapped_device *md) { int cpu; - struct hd_struct *part = dm_disk(md)->part0->bd_part; + struct block_device *part = dm_disk(md)->part0; long sum = 0; for_each_possible_cpu(cpu) { diff --git a/drivers/md/md.c b/drivers/md/md.c index 3696c2d77a4d..0065736f05b4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -464,7 +464,7 @@ struct 
md_io { bio_end_io_t *orig_bi_end_io; void *orig_bi_private; unsigned long start_time; - struct hd_struct *part; + struct block_device *part; }; static void md_end_io(struct bio *bio) @@ -8441,7 +8441,7 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; - curr_events = (int)part_stat_read_accum(disk->part0->bd_part, sectors) - + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats * as sync_io is counted when a request starts, and diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dca34489a1dc..8d90235e4fcc 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -89,12 +89,12 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, if (!ns->bdev) goto out; - host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); - data_units_read = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[READ]), 1000); - host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); - data_units_written = DIV_ROUND_UP(part_stat_read(ns->bdev->bd_part, - sectors[WRITE]), 1000); + host_reads = part_stat_read(ns->bdev, ios[READ]); + data_units_read = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes = part_stat_read(ns->bdev, ios[WRITE]); + data_units_written = + DIV_ROUND_UP(part_stat_read(ns->bdev, sectors[WRITE]), 1000); put_unaligned_le64(host_reads, &slog->host_reads[0]); put_unaligned_le64(data_units_read, &slog->data_units_read[0]); @@ -120,12 +120,12 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, /* we don't have the right data for file backed ns */ if (!ns->bdev) continue; - host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + host_reads += part_stat_read(ns->bdev, ios[READ]); data_units_read += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, 
sectors[READ]), 1000); - host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + part_stat_read(ns->bdev, sectors[READ]), 1000); + host_writes += part_stat_read(ns->bdev, ios[WRITE]); data_units_written += DIV_ROUND_UP( - part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000); + part_stat_read(ns->bdev, sectors[WRITE]), 1000); } put_unaligned_le64(host_reads, &slog->host_reads[0]); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6633b20224d5..c303a0ff0b17 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4048,9 +4048,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - if (sb->s_bdev->bd_part) - sbi->s_sectors_written_start = - part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]); + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Cleanup superblock name */ strreplace(sb->s_id, '/', '!'); @@ -5509,15 +5508,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & SB_RDONLY)) ext4_update_tstamp(es, s_wtime); - if (sb->s_bdev->bd_part) - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); - else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + EXT4_SB(sb)->s_sectors_written_start) >> 1)); if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) ext4_free_blocks_count_set(es, EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 4e27fe6ed3ae..075aa3a19ff5 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -62,11 +62,8 @@ static ssize_t session_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct 
super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", - (part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + (part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - sbi->s_sectors_written_start) >> 1); } @@ -74,12 +71,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_sb_info *sbi, char *buf) { struct super_block *sb = sbi->s_buddy_cache->i_sb; - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + - ((part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]) - + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - EXT4_SB(sb)->s_sectors_written_start) >> 1))); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cb700d797296..49681a8d2b14 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1675,7 +1675,7 @@ static inline bool f2fs_is_multi_device(struct f2fs_sb_info *sbi) * and the return value is in kbytes. s is of struct f2fs_sb_info. 
*/ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[STAT_WRITE]) - \ + (((u64)part_stat_read((s)->sb->s_bdev, sectors[STAT_WRITE]) - \ (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d4e7fab352ba..af9f449da64b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3700,10 +3700,8 @@ try_onemore: } /* For write statistics */ - if (sb->s_bdev->bd_part) - sbi->sectors_written_start = - (u64)part_stat_read(sb->s_bdev->bd_part, - sectors[STAT_WRITE]); + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev, sectors[STAT_WRITE]); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 619adea57098..1d4be1fc6007 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -191,7 +191,7 @@ struct request { }; struct gendisk *rq_disk; - struct hd_struct *part; + struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. 
*/ u64 alloc_time_ns; @@ -1943,9 +1943,9 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); -unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, - struct bio *bio); -void part_end_io_acct(struct hd_struct *part, struct bio *bio, +unsigned long part_start_io_acct(struct gendisk *disk, + struct block_device **part, struct bio *bio); +void part_end_io_acct(struct block_device *part, struct bio *bio, unsigned long start_time); /** diff --git a/include/linux/genhd.h b/include/linux/genhd.h index df7319da013c..fe6fee77e2b9 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -131,8 +131,8 @@ enum { struct disk_part_tbl { struct rcu_head rcu_head; int len; - struct hd_struct __rcu *last_lookup; - struct hd_struct __rcu *part[]; + struct block_device __rcu *last_lookup; + struct block_device __rcu *part[]; }; struct disk_events; diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h index 680de036691e..d2558121d48c 100644 --- a/include/linux/part_stat.h +++ b/include/linux/part_stat.h @@ -25,26 +25,26 @@ struct disk_stats { #define part_stat_unlock() preempt_enable() #define part_stat_get_cpu(part, field, cpu) \ - (per_cpu_ptr((part)->bdev->bd_stats, (cpu))->field) + (per_cpu_ptr((part)->bd_stats, (cpu))->field) #define part_stat_get(part, field) \ part_stat_get_cpu(part, field, smp_processor_id()) #define part_stat_read(part, field) \ ({ \ - typeof((part)->bdev->bd_stats->field) res = 0; \ + typeof((part)->bd_stats->field) res = 0; \ unsigned int _cpu; \ for_each_possible_cpu(_cpu) \ - res += per_cpu_ptr((part)->bdev->bd_stats, _cpu)->field; \ + res += per_cpu_ptr((part)->bd_stats, _cpu)->field; \ res; \ }) -static inline void part_stat_set_all(struct hd_struct *part, int value) +static inline void part_stat_set_all(struct block_device *part, int value) { int i; for_each_possible_cpu(i) - 
memset(per_cpu_ptr(part->bdev->bd_stats, i), value, + memset(per_cpu_ptr(part->bd_stats, i), value, sizeof(struct disk_stats)); } @@ -54,13 +54,12 @@ static inline void part_stat_set_all(struct hd_struct *part, int value) part_stat_read(part, field[STAT_DISCARD])) #define __part_stat_add(part, field, addnd) \ - __this_cpu_add((part)->bdev->bd_stats->field, addnd) + __this_cpu_add((part)->bd_stats->field, addnd) #define part_stat_add(part, field, addnd) do { \ __part_stat_add((part), field, addnd); \ - if ((part)->partno) \ - __part_stat_add(part_to_disk((part))->part0->bd_part, \ - field, addnd); \ + if ((part)->bd_partno) \ + __part_stat_add(bdev_whole(part), field, addnd); \ } while (0) #define part_stat_dec(part, field) \ -- cgit v1.2.3 From 41e5c81984eac8ce87f2b4f57fec0bd90a049b2b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:37:14 +0100 Subject: block: remove the partno field from struct hd_struct Just use the bd_partno field in struct block_device everywhere. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 12 ++++++------ block/partitions/core.c | 9 ++++----- include/linux/genhd.h | 1 - init/do_mounts.c | 2 +- 4 files changed, 11 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index ed06466b305d..b7e39b41a275 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -589,8 +589,8 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) int idx; /* in consecutive minor range? 
*/ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); + if (part->bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); return 0; } @@ -864,7 +864,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); + invalidate_partition(disk, part->bdev->bd_partno); delete_partition(part); } disk_part_iter_exit(&piter); @@ -1008,7 +1008,7 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->partno, name_buf), + disk_name(disk, part->bdev->bd_partno, name_buf), part->bdev->bd_meta_info ? part->bdev->bd_meta_info->uuid : ""); if (is_part0) { @@ -1102,7 +1102,7 @@ static int show_partition(struct seq_file *seqf, void *v) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->partno, buf)); + disk_name(sgp, part->bdev->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1525,7 +1525,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %u" "\n", MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + disk_name(gp, hd->bdev->bd_partno, buf), stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], diff --git a/block/partitions/core.c b/block/partitions/core.c index c2f6721633b8..6db9ca8b722d 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -184,7 +184,7 @@ static ssize_t part_partition_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", p->bdev->bd_partno); } static ssize_t part_start_show(struct device *dev, @@ -274,7 +274,7 @@ static int 
part_uevent(struct device *dev, struct kobj_uevent_env *env) { struct hd_struct *part = dev_to_part(dev); - add_uevent_var(env, "PARTN=%u", part->partno); + add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) add_uevent_var(env, "PARTNAME=%s", part->bdev->bd_meta_info->volname); @@ -298,7 +298,7 @@ void delete_partition(struct hd_struct *part) struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->partno], NULL); + rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->bdev->bd_holder_dir); @@ -372,7 +372,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); - p->partno = partno; bdev->bd_read_only = get_disk_ro(disk); if (info) { @@ -445,7 +444,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || + if (part->bdev->bd_partno == skip_partno || start >= part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->bdev->bd_start_sect) diff --git a/include/linux/genhd.h b/include/linux/genhd.h index fe6fee77e2b9..3c13d4708e3f 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -54,7 +54,6 @@ struct partition_meta_info { struct hd_struct { struct block_device *bdev; struct device __dev; - int partno; }; /** diff --git a/init/do_mounts.c b/init/do_mounts.c index 368ccb718501..86bef93e72eb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -136,7 +136,7 @@ static dev_t devt_from_partuuid(const char *uuid_str) struct hd_struct *part; part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->partno + offset); + dev_to_part(dev)->bdev->bd_partno + offset); if (part) { devt = part_devt(part); 
put_device(part_to_dev(part)); -- cgit v1.2.3 From 9fc995a6e08349b5c5baff2cc31544b96ee2b1c3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:17:46 +0100 Subject: block: pass a block_device to blk_alloc_devt Pass the block_device actually needed instead of the hd_struct. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 14 +++++++------- block/partitions/core.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'block') diff --git a/block/blk.h b/block/blk.h index d5bf8f3a0781..9657c6da7c77 100644 --- a/block/blk.h +++ b/block/blk.h @@ -350,7 +350,7 @@ static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); -int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +int blk_alloc_devt(struct block_device *part, dev_t *devt); void blk_free_devt(dev_t devt); char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 diff --git a/block/genhd.c b/block/genhd.c index b7e39b41a275..fd6333332ab5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -570,8 +570,8 @@ static int blk_mangle_minor(int minor) } /** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for + * blk_alloc_devt - allocate a dev_t for a block device + * @bdev: block device to allocate dev_t for * @devt: out parameter for resulting dev_t * * Allocate a dev_t for block device. @@ -583,14 +583,14 @@ static int blk_mangle_minor(int minor) * CONTEXT: * Might sleep. */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +int blk_alloc_devt(struct block_device *bdev, dev_t *devt) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = bdev->bd_disk; int idx; /* in consecutive minor range? 
*/ - if (part->bdev->bd_partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->bdev->bd_partno); + if (bdev->bd_partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); return 0; } @@ -746,7 +746,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk, disk->flags |= GENHD_FL_UP; - retval = blk_alloc_devt(disk->part0->bd_part, &devt); + retval = blk_alloc_devt(disk->part0, &devt); if (retval) { WARN_ON(1); return; diff --git a/block/partitions/core.c b/block/partitions/core.c index 6db9ca8b722d..3d8243334c7c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -392,7 +392,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(p, &devt); + err = blk_alloc_devt(bdev, &devt); if (err) goto out_bdput; pdev->devt = devt; -- cgit v1.2.3 From 71773cf797490e1cbe4909b25a2543937e7eea82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:20:37 +0100 Subject: block: pass a block_device to invalidate_partition Pass the block_device actually needed instead of looking it up using bdget_disk. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index fd6333332ab5..452f7c646e02 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -803,14 +803,8 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) } EXPORT_SYMBOL(device_add_disk_no_queue_reg); -static void invalidate_partition(struct gendisk *disk, int partno) +static void invalidate_partition(struct block_device *bdev) { - struct block_device *bdev; - - bdev = bdget_disk(disk, partno); - if (!bdev) - return; - fsync_bdev(bdev); __invalidate_device(bdev, true); @@ -819,7 +813,6 @@ static void invalidate_partition(struct gendisk *disk, int partno) * up any more even if openers still hold references to it. */ remove_inode_hash(bdev->bd_inode); - bdput(bdev); } /** @@ -864,12 +857,12 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->bdev->bd_partno); + invalidate_partition(part->bdev); delete_partition(part); } disk_part_iter_exit(&piter); - invalidate_partition(disk, 0); + invalidate_partition(disk->part0); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; up_write(&bdev_lookup_sem); -- cgit v1.2.3 From ad1eaa5344b293552b6ba43f5709c76a9aa14d17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:52:59 +0100 Subject: block: switch disk_part_iter_* to use a struct block_device Switch the partition iter infrastructure to iterate over block_device references instead of hd_struct ones mostly used to get at the block_device. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 59 ++++++++++++++++++++++++----------------------- block/partitions/core.c | 13 +++++------ drivers/s390/block/dasd.c | 8 +++---- include/linux/genhd.h | 4 ++-- 4 files changed, 42 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 452f7c646e02..2d34dd2da4e9 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_init); * CONTEXT: * Don't care. */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +struct block_device *disk_part_iter_next(struct disk_part_iter *piter) { struct disk_part_tbl *ptbl; int inc, end; @@ -282,8 +282,9 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) piter->idx == 0)) continue; - get_device(part_to_dev(part->bd_part)); - piter->part = part->bd_part; + piter->part = bdgrab(part); + if (!piter->part) + continue; piter->idx += inc; break; } @@ -305,7 +306,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_next); */ void disk_part_iter_exit(struct disk_part_iter *piter) { - disk_put_part(piter->part); + if (piter->part) + bdput(piter->part); piter->part = NULL; } EXPORT_SYMBOL_GPL(disk_part_iter_exit); @@ -346,7 +348,6 @@ struct block_device *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) for (i = 1; i < ptbl->len; i++) { part = rcu_dereference(ptbl->part[i]); - if (part && sector_in_part(part, sector)) { rcu_assign_pointer(ptbl->last_lookup, part); goto out_unlock; @@ -647,7 +648,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, { struct device *ddev = disk_to_dev(disk); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; int err; ddev->parent = parent; @@ -697,7 +698,7 @@ static void register_disk(struct device *parent, struct gendisk *disk, /* announce possible partitions */ disk_part_iter_init(&piter, disk, 0); while ((part 
= disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + kobject_uevent(bdev_kobj(part), KOBJ_ADD); disk_part_iter_exit(&piter); if (disk->queue->backing_dev_info->dev) { @@ -837,7 +838,7 @@ static void invalidate_partition(struct block_device *bdev) void del_gendisk(struct gendisk *disk) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; might_sleep(); @@ -857,8 +858,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(part->bdev); - delete_partition(part); + invalidate_partition(part); + delete_partition(part->bd_part); } disk_part_iter_exit(&piter); @@ -977,7 +978,7 @@ void __init printk_all_partitions(void) while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char name_buf[BDEVNAME_SIZE]; char devt_buf[BDEVT_SIZE]; @@ -996,14 +997,14 @@ void __init printk_all_partitions(void) */ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == disk->part0->bd_part; + bool is_part0 = part == disk->part0; printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(disk, part->bdev->bd_partno, name_buf), - part->bdev->bd_meta_info ? - part->bdev->bd_meta_info->uuid : ""); + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, + disk_name(disk, part->bd_partno, name_buf), + part->bd_meta_info ? 
+ part->bd_meta_info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) printk(" driver: %s\n", @@ -1079,7 +1080,7 @@ static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ @@ -1093,9 +1094,9 @@ static int show_partition(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - bdev_nr_sectors(part->bdev) >> 1, - disk_name(sgp, part->bdev->bd_partno, buf)); + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, + disk_name(sgp, part->bd_partno, buf)); disk_part_iter_exit(&piter); return 0; @@ -1489,7 +1490,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; struct disk_part_iter piter; - struct hd_struct *hd; + struct block_device *hd; char buf[BDEVNAME_SIZE]; unsigned int inflight; struct disk_stats stat; @@ -1504,11 +1505,11 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd, &stat); + part_stat_read_all(hd->bd_part, &stat); if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd->bdev); + inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(hd->bdev); + inflight = part_in_flight(hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " @@ -1517,8 +1518,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->bdev->bd_partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), + disk_name(gp, hd->bd_partno, buf), stat.ios[STAT_READ], 
stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1673,7 +1674,7 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) void set_disk_ro(struct gendisk *disk, int flag) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (disk->part0->bd_read_only != flag) { set_disk_ro_uevent(disk, flag); @@ -1682,7 +1683,7 @@ void set_disk_ro(struct gendisk *disk, int flag) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - part->bdev->bd_read_only = flag; + part->bd_read_only = flag; disk_part_iter_exit(&piter); } diff --git a/block/partitions/core.c b/block/partitions/core.c index 3d8243334c7c..4cb6df175f90 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -439,15 +439,14 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { - if (part->bdev->bd_partno == skip_partno || - start >= part->bdev->bd_start_sect + - bdev_nr_sectors(part->bdev) || - start + length <= part->bdev->bd_start_sect) + if (part->bd_partno == skip_partno || + start >= part->bd_start_sect + bdev_nr_sectors(part) || + start + length <= part->bd_start_sect) continue; overlap = true; break; @@ -568,7 +567,7 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) int blk_drop_partitions(struct block_device *bdev) { struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (bdev->bd_part_count) return -EBUSY; @@ -578,7 +577,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part); + delete_partition(part->bd_part); disk_part_iter_exit(&piter); return 0; diff --git 
a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index db24e04ee978..1825fa8d05a7 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -432,7 +432,7 @@ dasd_state_ready_to_online(struct dasd_device * device) { struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; device->state = DASD_STATE_ONLINE; if (device->block) { @@ -445,7 +445,7 @@ dasd_state_ready_to_online(struct dasd_device * device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; @@ -459,7 +459,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) int rc; struct gendisk *disk; struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; if (device->discipline->online_to_ready) { rc = device->discipline->online_to_ready(device); @@ -472,7 +472,7 @@ static int dasd_state_online_to_ready(struct dasd_device *device) disk = device->block->bdev->bd_disk; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); + kobject_uevent(bdev_kobj(part), KOBJ_CHANGE); disk_part_iter_exit(&piter); } return 0; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3c13d4708e3f..cd23c80265b2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -244,14 +244,14 @@ static inline void disk_put_part(struct hd_struct *part) struct disk_part_iter { struct gendisk *disk; - struct hd_struct *part; + struct block_device *part; int idx; unsigned int flags; }; extern void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, unsigned int flags); -extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter); +struct block_device 
*disk_part_iter_next(struct disk_part_iter *piter); extern void disk_part_iter_exit(struct disk_part_iter *piter); extern bool disk_has_partitions(struct gendisk *disk); -- cgit v1.2.3 From 0d02129e76edf91cf04fabf1efbc3a9a1f1d729a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:43:51 +0100 Subject: block: merge struct block_device and struct hd_struct Instead of having two structures that represent each block device with different life time rules, merge them into a single one. This also greatly simplifies the reference counting rules, as we can use the inode reference count as the main reference count for the new struct block_device, with the device model reference front ending it for device model interaction. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++-- block/blk.h | 2 +- block/genhd.c | 90 +++++++++++------------------------ block/partitions/core.c | 116 ++++++++++++++++++---------------------------- fs/block_dev.c | 9 ---- include/linux/blk_types.h | 8 +++- include/linux/blkdev.h | 1 - include/linux/genhd.h | 40 ++++------------ init/do_mounts.c | 21 ++++----- kernel/trace/blktrace.c | 43 ++++------------- 10 files changed, 108 insertions(+), 230 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79aa96240cec..031114d454a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; @@ -830,7 +830,7 @@ static void 
blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk.h b/block/blk.h index 9657c6da7c77..98f0b1ae2641 100644 --- a/block/blk.h +++ b/block/blk.h @@ -356,7 +356,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct hd_struct *part); +void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 2d34dd2da4e9..0fabfc90b8e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -106,13 +106,14 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -167,39 +168,6 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) return rcu_dereference(ptbl->part[partno]); } -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. 
- */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - struct hd_struct *part; - - rcu_read_lock(); - bdev = __disk_get_part(disk, partno); - if (!bdev) - goto fail; - part = bdev->bd_part; - if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) - goto fail; - rcu_read_unlock(); - return part; -fail: - rcu_read_unlock(); - return NULL; -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -859,7 +827,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); - delete_partition(part->bd_part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -952,13 +920,13 @@ void blk_request_module(dev_t devt) */ struct block_device *bdget_disk(struct gendisk *disk, int partno) { - struct hd_struct *part; struct block_device *bdev = NULL; - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); + rcu_read_lock(); + bdev = __disk_get_part(disk, partno); + if (bdev && !bdgrab(bdev)) + bdev = NULL; + rcu_read_unlock(); return bdev; } @@ -1175,24 +1143,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p->bdev); + inflight = 
blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p->bdev); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1227,14 +1193,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p->bdev, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p->bdev, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1282,20 +1248,17 @@ static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->bdev->bd_make_it_fail = i; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1505,7 +1468,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd->bd_part, &stat); + part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else @@ -1577,7 +1540,7 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct 
hd_struct *part; + struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1590,13 +1553,12 @@ dev_t blk_lookup_devt(const char *name, int partno) MINOR(dev->devt) + partno); break; } - part = disk_get_part(disk, partno); + part = bdget_disk(disk, partno); if (part) { - devt = part_devt(part); - disk_put_part(part); + devt = part->bd_dev; + bdput(part); break; } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4cb6df175f90..deca253583bd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -182,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->bdev->bd_read_only); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct 
hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -264,20 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - bdput(p->bdev); + bdput(dev_to_bdev(dev)); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); - if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", - part->bdev->bd_meta_info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } @@ -292,25 +284,25 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -void delete_partition(struct hd_struct *part) +void delete_partition(struct block_device *part) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = part->bd_disk; struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->bdev->bd_holder_dir); - device_del(part_to_dev(part)); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ - remove_inode_hash(part->bdev->bd_inode); + remove_inode_hash(part->bd_inode); - put_device(part_to_dev(part)); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -324,11 +316,10 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; @@ -367,9 +358,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!bdev) return ERR_PTR(-ENOMEM); - p = bdev->bd_part; - pdev = part_to_dev(p); - bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); bdev->bd_read_only = get_disk_ro(disk); @@ -381,6 +369,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_bdput; } + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -422,7 +411,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; out_bdput: bdput(bdev); @@ -459,7 +448,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct hd_struct *part; + struct block_device *part; mutex_lock(&bdev->bd_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { @@ -475,76 +464,59 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) + part = bdget_disk(bdev->bd_disk, partno); + if (!part) return -ENXIO; - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, 
partno); - if (!part) - goto out_unlock; - ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); + sync_blockdev(part); + invalidate_bdev(part); delete_partition(part); ret = 0; out_unlock: mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&part->bd_mutex); + bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; + struct block_device *part; int ret = 0; - part = disk_get_part(bdev->bd_disk, partno); + part = bdget_disk(bdev->bd_disk, partno); if (!part) return -ENXIO; - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -EINVAL; - if (start != part->bdev->bd_start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - bdev_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&part->bd_mutex); mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + bdput(part); return ret; } @@ -577,7 +549,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part->bd_part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; @@ -592,7 +564,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -632,7 +604,7 @@ static bool 
blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 61cf33b6284f..a9905d8fd02b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,7 +39,6 @@ struct bdev_inode { struct block_device bdev; - struct hd_struct hd; struct inode vfs_inode; }; @@ -887,9 +886,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } - bdev->bd_part = &BDEV_I(inode)->hd; - memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); - bdev->bd_part->bdev = bdev; return bdev; } @@ -926,11 +922,6 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); -struct block_device *bdget_part(struct hd_struct *part) -{ - return bdget(part_devt(part)); -} - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6edea5c16259..866f74261b3b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -8,6 +8,7 @@ #include #include +#include #include struct bio_set; @@ -30,6 +31,7 @@ struct block_device { struct super_block * bd_super; struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; + struct device bd_device; void * bd_holder; int bd_holders; bool bd_write_holder; @@ -38,7 +40,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 bd_partno; - struct hd_struct * bd_part; /* number of times partitions within this device have been opened. */ unsigned bd_part_count; @@ -61,8 +62,11 @@ struct block_device { #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) +#define dev_to_bdev(device) \ + container_of((device), struct block_device, bd_device) + #define bdev_kobj(_bdev) \ - (&part_to_dev((_bdev)->bd_part)->kobj) + (&((_bdev)->bd_device.kobj)) /* * Block error status values. 
See block/blk-core:blk_errors for the details. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d4be1fc6007..17cedf0dc83d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,7 +1999,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd23c80265b2..809aaa32d53c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,12 +19,6 @@ #include #include -#define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define part_to_dev(part) (&((part)->__dev)) - -#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) -#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) - extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,11 +45,6 @@ struct partition_meta_info { u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; -struct hd_struct { - struct block_device *bdev; - struct device __dev; -}; - /** * DOC: genhd capability flags * @@ -190,19 +179,21 @@ struct gendisk { struct lockdep_map lockdep_map; }; +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. 
+ */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif -static inline struct gendisk *part_to_disk(struct hd_struct *part) -{ - if (unlikely(!part)) - return NULL; - return part->bdev->bd_disk; -} - static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -221,19 +212,6 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } -static inline dev_t part_devt(struct hd_struct *part) -{ - return part_to_dev(part)->devt; -} - -extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); - -static inline void disk_put_part(struct hd_struct *part) -{ - if (likely(part)) - put_device(part_to_dev(part)); -} - /* * Smarter partition iterator without context limits. */ diff --git a/init/do_mounts.c b/init/do_mounts.c index 86bef93e72eb..a78e44ee6adb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -76,11 +76,11 @@ struct uuidcmp { */ static int match_dev_by_uuid(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -133,13 +133,13 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. 
*/ - struct hd_struct *part; + struct block_device *part; - part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->bdev->bd_partno + offset); + part = bdget_disk(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); if (part) { - devt = part_devt(part); - put_device(part_to_dev(part)); + devt = part->bd_dev; + bdput(part); } } else { devt = dev->devt; @@ -166,11 +166,10 @@ clear_root_wait: */ static int match_dev_by_label(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const char *label = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strcmp(label, part->bdev->bd_meta_info->volname)) + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) return 0; return 1; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a723a91ec5a..a482a37848bf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1810,30 +1810,15 @@ static ssize_t blk_trace_mask2str(char *buf, int mask) return p - buf; } -static struct request_queue *blk_trace_get_queue(struct block_device *bdev) -{ - if (bdev->bd_disk == NULL) - return NULL; - - return bdev_get_queue(bdev); -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = bdget_part(dev_to_part(dev)); - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, @@ -1856,9 +1841,6 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); -out: return ret; } @@ -1866,8 +1848,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute 
*attr, const char *buf, size_t count) { - struct block_device *bdev; - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1883,17 +1865,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; value = ret; } - } else if (kstrtoull(buf, 0, &value)) - goto out; - - ret = -ENXIO; - bdev = bdget_part(dev_to_part(dev)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; + } else { + if (kstrtoull(buf, 0, &value)) + goto out; + } mutex_lock(&q->debugfs_mutex); @@ -1931,8 +1906,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); out: return ret ? ret : count; } -- cgit v1.2.3 From 977115c0f664e016a6b2774d4f97116ade23d732 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 10:41:07 +0100 Subject: block: stop using bdget_disk for partition 0 We can just dereference the pointer in struct gendisk instead. Also remove the now unused export. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/genhd.c | 1 - drivers/block/nbd.c | 4 +--- drivers/block/xen-blkfront.c | 20 +++++--------------- drivers/block/zram/zram_drv.c | 14 ++------------ drivers/md/dm.c | 16 ++-------------- drivers/s390/block/dasd_ioctl.c | 5 ++--- fs/block_dev.c | 2 +- 7 files changed, 13 insertions(+), 49 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index 0fabfc90b8e4..b84b8671e627 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -930,7 +930,6 @@ struct block_device *bdget_disk(struct gendisk *disk, int partno) return bdev; } -EXPORT_SYMBOL(bdget_disk); /* * print a full list of all partitions - intended for places where the root diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 014683968ce1..92f84ed0ba9e 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1488,12 +1488,10 @@ out: static void nbd_release(struct gendisk *disk, fmode_t mode) { struct nbd_device *nbd = disk->private_data; - struct block_device *bdev = bdget_disk(disk, 0); if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) && - bdev->bd_openers == 0) + disk->part0->bd_openers == 0) nbd_disconnect_and_put(nbd); - bdput(bdev); nbd_config_put(nbd); nbd_put(nbd); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 79521e33d30e..188e0b47534b 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2153,7 +2153,7 @@ static void blkfront_closing(struct blkfront_info *info) } if (info->gd) - bdev = bdget_disk(info->gd, 0); + bdev = bdgrab(info->gd->part0); mutex_unlock(&info->mutex); @@ -2518,7 +2518,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) disk = info->gd; if (disk) - bdev = bdget_disk(disk, 0); + bdev = bdgrab(disk->part0); info->xbdev = NULL; mutex_unlock(&info->mutex); @@ -2595,19 +2595,11 @@ out: static void blkif_release(struct gendisk *disk, fmode_t mode) { 
struct blkfront_info *info = disk->private_data; - struct block_device *bdev; struct xenbus_device *xbdev; mutex_lock(&blkfront_mutex); - - bdev = bdget_disk(disk, 0); - - if (!bdev) { - WARN(1, "Block device %s yanked out from us!\n", disk->disk_name); + if (disk->part0->bd_openers) goto out_mutex; - } - if (bdev->bd_openers) - goto out; /* * Check if we have been instructed to close. We will have @@ -2619,7 +2611,7 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) if (xbdev && xbdev->state == XenbusStateClosing) { /* pending switch to state closed */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + dev_info(disk_to_dev(disk), "releasing disk\n"); xlvbd_release_gendisk(info); xenbus_frontend_closed(info->xbdev); } @@ -2628,14 +2620,12 @@ static void blkif_release(struct gendisk *disk, fmode_t mode) if (!xbdev) { /* sudden device removal */ - dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); + dev_info(disk_to_dev(disk), "releasing disk\n"); xlvbd_release_gendisk(info); disk->private_data = NULL; free_info(info); } -out: - bdput(bdev); out_mutex: mutex_unlock(&blkfront_mutex); } diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index dc8957d173d3..b0701bae6e98 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1760,15 +1760,12 @@ static ssize_t reset_store(struct device *dev, return -EINVAL; zram = dev_to_zram(dev); - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + bdev = zram->disk->part0; mutex_lock(&bdev->bd_mutex); /* Do not reset an active device or claimed device */ if (bdev->bd_openers || zram->claim) { mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return -EBUSY; } @@ -1779,7 +1776,6 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - bdput(bdev); mutex_lock(&bdev->bd_mutex); zram->claim = false; @@ -1965,16 +1961,11 @@ out_free_dev: static int 
zram_remove(struct zram *zram) { - struct block_device *bdev; - - bdev = bdget_disk(zram->disk, 0); - if (!bdev) - return -ENOMEM; + struct block_device *bdev = zram->disk->part0; mutex_lock(&bdev->bd_mutex); if (bdev->bd_openers || zram->claim) { mutex_unlock(&bdev->bd_mutex); - bdput(bdev); return -EBUSY; } @@ -1986,7 +1977,6 @@ static int zram_remove(struct zram *zram) /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - bdput(bdev); pr_info("Removed device: %s\n", zram->disk->disk_name); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 176adcff56b3..ed7e836efbcd 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2375,16 +2375,11 @@ out: */ static int lock_fs(struct mapped_device *md) { - struct block_device *bdev; int r; WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - bdev = bdget_disk(md->disk, 0); - if (!bdev) - return -ENOMEM; - r = freeze_bdev(bdev); - bdput(bdev); + r = freeze_bdev(md->disk->part0); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; @@ -2392,16 +2387,9 @@ static int lock_fs(struct mapped_device *md) static void unlock_fs(struct mapped_device *md) { - struct block_device *bdev; - if (!test_bit(DMF_FROZEN, &md->flags)) return; - - bdev = bdget_disk(md->disk, 0); - if (!bdev) - return; - thaw_bdev(bdev); - bdput(bdev); + thaw_bdev(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 304eba1acf16..9f6424408946 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -220,9 +220,8 @@ dasd_format(struct dasd_block *block, struct format_data_t *fdata) * enabling the device later. 
*/ if (fdata->start_unit == 0) { - struct block_device *bdev = bdget_disk(block->gdp, 0); - bdev->bd_inode->i_blkbits = blksize_bits(fdata->blksize); - bdput(bdev); + block->gdp->part0->bd_inode->i_blkbits = + blksize_bits(fdata->blksize); } rc = base->discipline->format_device(base, fdata, 1); diff --git a/fs/block_dev.c b/fs/block_dev.c index a9905d8fd02b..9e56ee1f2652 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1299,7 +1299,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (ret) return ret; } else { - struct block_device *whole = bdget_disk(disk, 0); + struct block_device *whole = bdgrab(disk->part0); mutex_lock_nested(&whole->bd_mutex, 1); ret = __blkdev_get(whole, mode); -- cgit v1.2.3 From 22b56c2964386ddced252be407150b22f85e209e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 24 Nov 2020 17:58:13 +0000 Subject: bio: optimise bvec iteration __bio_for_each_bvec(), __bio_for_each_segment() and bio_copy_data_iter() fall under conditions of bvec_iter_advance_single(), which is a faster and slimmer version of bvec_iter_advance(). Add bio_advance_iter_single() and convert them. 
Signed-off-by: Pavel Begunkov Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bio.c | 4 ++-- include/linux/bio.h | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/bio.c b/block/bio.c index ebb18136b86f..1f2cc1fbe283 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1212,8 +1212,8 @@ void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, flush_dcache_page(dst_bv.bv_page); - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); diff --git a/include/linux/bio.h b/include/linux/bio.h index ecf67108f091..1edda614f7ce 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -148,11 +148,24 @@ static inline void bio_advance_iter(const struct bio *bio, /* TODO: It is reasonable to complete bio with error here. */ } +/* @bytes should be less or equal to bvec[i->bi_idx].bv_len */ +static inline void bio_advance_iter_single(const struct bio *bio, + struct bvec_iter *iter, + unsigned int bytes) +{ + iter->bi_sector += bytes >> 9; + + if (bio_no_advance_iter(bio)) + iter->bi_size -= bytes; + else + bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); +} + #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) @@ -161,7 +174,7 @@ static inline void bio_advance_iter(const struct bio *bio, for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ - bio_advance_iter((bio), &(iter), (bvl).bv_len)) + bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) /* iterate over multi-page bvec 
*/ #define bio_for_each_bvec(bvl, bio, iter) \ -- cgit v1.2.3 From b0d97557ebfc9d5ba5f2939339a9fdd267abafeb Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Wed, 2 Dec 2020 19:11:45 +0800 Subject: block: fix inflight statistics of part0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inflight of partition 0 doesn't include inflight IOs to all sub-partitions, since currently mq calculates inflight of specific partition by simply comparing the value of the partition pointer. Thus the following case is possible: $ cat /sys/block/vda/inflight        0        0 $ cat /sys/block/vda/vda1/inflight        0      128 While single queue device (on a previous version, e.g. v3.10) does not have this issue: $cat /sys/block/sda/sda3/inflight 0 33 $cat /sys/block/sda/inflight 0 33 Partition 0 should be specially handled since it represents the whole disk. This issue is introduced since commit bf0ddaba65dd ("blk-mq: fix sysfs inflight counter"). Besides, this patch can also fix the inflight statistics of part 0 in /proc/diskstats. Before this patch, the inflight statistics of part 0 doesn't include that of sub partitions. (I have marked the 'inflight' field with asterisk.) $cat /proc/diskstats 259 0 nvme0n1 45974469 0 367814768 6445794 1 0 1 0 *0* 111062 6445794 0 0 0 0 0 0 259 2 nvme0n1p1 45974058 0 367797952 6445727 0 0 0 0 *33* 111001 6445727 0 0 0 0 0 0 This is introduced since commit f299b7c7a9de ("blk-mq: provide internal in-flight variant"). 
Fixes: bf0ddaba65dd ("blk-mq: fix sysfs inflight counter") Fixes: f299b7c7a9de ("blk-mq: provide internal in-flight variant") Signed-off-by: Jeffle Xu Reviewed-by: Christoph Hellwig [axboe: adapt for 5.11 partition change] Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index a2593748fa53..37c682855a63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -105,7 +105,8 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, { struct mq_inflight *mi = priv; - if (rq->part == mi->part && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) + if ((!mi->part->bd_partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; -- cgit v1.2.3 From acaf523a7bf226b28504306c1cfee194520123b3 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 26 Nov 2020 11:18:34 +0800 Subject: blk-throttle: don't check whether or not lower limit is valid if CONFIG_BLK_DEV_THROTTLING_LOW is off blk_throtl_update_limit_valid() will search for descendants to see if 'LIMIT_LOW' of bps/iops and READ/WRITE is nonzero. However, they're always zero if CONFIG_BLK_DEV_THROTTLING_LOW is not set, furthermore, a lot of time will be wasted to iterate descendants. Thus do nothing in blk_throtl_update_limit_valid() in such situation. 
Signed-off-by: Yu Kuai Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-throttle.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/blk-throttle.c b/block/blk-throttle.c index b771c4299982..d52cac9f3a7c 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -587,6 +587,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -607,6 +608,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) -- cgit v1.2.3 From e8a676d61c07eccfcd9d6fddfe4dcb630651c29a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:36 +0100 Subject: block: simplify and extend the block_bio_merge tracepoint class The block_bio_merge tracepoint class can be reused for most bio-based tracepoints. For that it just needs to lose the superfluous q and rq parameters. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 4 +- block/blk-mq.c | 2 +- block/bounce.c | 2 +- include/trace/events/block.h | 158 +++++++++---------------------------------- kernel/trace/blktrace.c | 41 +++-------- 6 files changed, 48 insertions(+), 161 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index cee568389b7e..cb24654983e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -907,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); + trace_block_bio_queue(bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index cb351ab9b77d..1a46d5bbd399 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -922,7 +922,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_backmerge(req->q, req, bio); + trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) @@ -946,7 +946,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_frontmerge(req->q, req, bio); + trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37c682855a63..21e2b4b6b742 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2184,7 +2184,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - trace_block_getrq(q, bio, bio->bi_opf); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); diff --git 
a/block/bounce.c b/block/bounce.c index 162a6eee8999..d3f51acd6e3b 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -340,7 +340,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, } } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(*bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 76459cf750e1..506c29dc7c76 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -226,45 +226,6 @@ DEFINE_EVENT(block_rq, block_rq_merge, TP_ARGS(q, rq) ); -/** - * block_bio_bounce - used bounce buffer when processing block operation - * @q: queue holding the block operation - * @bio: block operation - * - * A bounce buffer was used to handle the block operation @bio in @q. - * This occurs when hardware limitations prevent a direct transfer of - * data between the @bio data memory area and the IO device. Use of a - * bounce buffer requires extra copying of data and decreases - * performance. 
- */ -TRACE_EVENT(block_bio_bounce, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation @@ -301,11 +262,11 @@ TRACE_EVENT(block_bio_complete, __entry->nr_sector, __entry->error) ); -DECLARE_EVENT_CLASS(block_bio_merge, +DECLARE_EVENT_CLASS(block_bio, - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), + TP_PROTO(struct bio *bio), - TP_ARGS(q, rq, bio), + TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) @@ -329,116 +290,63 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->nr_sector, __entry->comm) ); +/** + * block_bio_bounce - used bounce buffer when processing block operation + * @bio: block operation + * + * A bounce buffer was used to handle the block operation @bio in @q. + * This occurs when hardware limitations prevent a direct transfer of + * data between the @bio data memory area and the IO device. Use of a + * bounce buffer requires extra copying of data and decreases + * performance. 
+ */ +DEFINE_EVENT(block_bio, block_bio_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /** * block_bio_backmerge - merging block operation to the end of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block request @bio to the end of an existing block request - * in queue @q. + * Merging block request @bio to the end of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_backmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_backmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block IO operation @bio to the beginning of an existing block - * operation in queue @q. + * Merging block IO operation @bio to the beginning of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_frontmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_frontmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue - * @q: queue holding operation * @bio: new block operation * * About to place the block IO operation @bio into queue @q. 
*/ -TRACE_EVENT(block_bio_queue, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - -DECLARE_EVENT_CLASS(block_get_rq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio ? bio_dev(bio) : 0; - __entry->sector = bio ? bio->bi_iter.bi_sector : 0; - __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, - bio ? bio->bi_opf : 0, __entry->nr_sector); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) +DEFINE_EVENT(block_bio, block_bio_queue, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations - * @q: queue for operations * @bio: pending block IO operation (can be %NULL) - * @rw: low bit indicates a read (%0) or a write (%1) * - * A request struct for queue @q has been allocated to handle the - * block IO operation @bio. 
+ * A request struct has been allocated to handle the block IO operation @bio. */ -DEFINE_EVENT(block_get_rq, block_getrq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw) +DEFINE_EVENT(block_bio, block_getrq, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ced589df304b..7ab88e00c157 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -906,10 +906,9 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -919,44 +918,24 @@ static void blk_add_trace_bio_complete(void *ignore, blk_status_to_errno(bio->bi_status)); } -static void blk_add_trace_bio_backmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); } -static void blk_add_trace_bio_frontmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } -static void blk_add_trace_bio_queue(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); } -static void blk_add_trace_getrq(void *ignore, - struct 
request_queue *q, - struct bio *bio, int rw) +static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); - else { - struct blk_trace *bt; - - rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); - if (bt) - __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, 0); - rcu_read_unlock(); - } + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) -- cgit v1.2.3 From eb6f7f7cd3af0f67ce57b21fab1bc64beb643581 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:37 +0100 Subject: block: remove the request_queue argument to the block_split tracepoint The request_queue can trivially be derived from the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- drivers/md/dm.c | 2 +- include/trace/events/block.h | 14 ++++++-------- kernel/trace/blktrace.c | 5 ++--- 4 files changed, 10 insertions(+), 13 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 1a46d5bbd399..4071daa88a5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,7 +338,7 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ed7e836efbcd..9a5bd90779c7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,7 +1612,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, part_stat_unlock(); bio_chain(b, bio); - trace_block_split(md->queue, b, bio->bi_iter.bi_sector); + trace_block_split(b, bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); break; } diff 
--git a/include/trace/events/block.h b/include/trace/events/block.h index 506c29dc7c76..b415e4cba843 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -411,21 +411,19 @@ DEFINE_EVENT(block_unplug, block_unplug, /** * block_split - split a single bio struct into two bio structs - * @q: queue containing the bio * @bio: block operation being split * @new_sector: The starting sector for the new bio * - * The bio request @bio in request queue @q needs to be split into two - * bio requests. The newly created @bio request starts at - * @new_sector. This split may be required due to hardware limitation - * such as operation crossing device boundaries in a RAID system. + * The bio request @bio needs to be split into two bio requests. The newly + * created @bio request starts at @new_sector. This split may be required due to + * hardware limitations such as operation crossing device boundaries in a RAID + * system. */ TRACE_EVENT(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, - unsigned int new_sector), + TP_PROTO(struct bio *bio, unsigned int new_sector), - TP_ARGS(q, bio, new_sector), + TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ab88e00c157..3ca6d62114f4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -970,10 +970,9 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } -static void blk_add_trace_split(void *ignore, - struct request_queue *q, struct bio *bio, - unsigned int pdu) +static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; rcu_read_lock(); -- cgit v1.2.3 From 1c02fca620f7273b597591065d366e2cca948d8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:38 +0100 Subject: block: remove the request_queue argument to the block_bio_remap 
tracepoint The request_queue can trivially be derived from the bio. Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/md/dm.c | 3 +-- drivers/md/md-linear.c | 3 +-- drivers/md/md.c | 5 ++--- drivers/md/raid0.c | 4 ++-- drivers/md/raid1.c | 7 +++---- drivers/md/raid10.c | 6 ++---- drivers/md/raid5.c | 15 +++++++-------- drivers/nvme/host/multipath.c | 3 +-- include/trace/events/block.h | 8 +++----- kernel/trace/blktrace.c | 14 +++++--------- 11 files changed, 28 insertions(+), 42 deletions(-) (limited to 'block') diff --git a/block/blk-core.c b/block/blk-core.c index cb24654983e1..96e5fcd7f071 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -758,7 +758,7 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; bio->bi_iter.bi_sector += p->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, + trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9a5bd90779c7..5181907dc595 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1276,8 +1276,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(io->orig_bio), sector); + trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 98f1b4b2bdce..68cac7d19278 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -257,8 +257,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, - bio, disk_devt(mddev->gendisk), + 
trace_block_bio_remap(bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0065736f05b4..c555be0a8dce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8591,9 +8591,8 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); + trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL(md_submit_discard_bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f44177593a5..e5d7411cba9b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -571,8 +571,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(mddev->gendisk), bio_sector); + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 960d854c07f8..c0347997f6ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1305,8 +1305,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, - disk_devt(mddev->gendisk), r1_bio->sector); + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1517,8 +1517,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, 
disk_devt(mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)conf->mirrors[i].rdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..a6f99fa0b32c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1200,8 +1200,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, - read_bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), r10_bio->sector); submit_bio_noacct(read_bio); return; @@ -1250,8 +1249,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(conf->mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)rdev; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39343479ac2a..3a90cc0e43ca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1222,9 +1222,9 @@ again: set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bi->bi_disk->queue, - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(bi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1272,9 +1272,9 @@ again: if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(rbi->bi_disk->queue, - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(rbi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ 
-5468,8 +5468,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(align_bi->bi_disk->queue, - align_bi, disk_devt(mddev->gendisk), + trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bi); return 1; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c17..106cf5c44ee7 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -312,8 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(ns->head->disk), + trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); } else if (nvme_available_path(head)) { diff --git a/include/trace/events/block.h b/include/trace/events/block.h index b415e4cba843..8fb89574d867 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -450,9 +450,8 @@ TRACE_EVENT(block_split, /** * block_bio_remap - map request for a logical device to the raw device - * @q: queue holding the operation * @bio: revised operation - * @dev: device for the operation + * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the @@ -460,10 +459,9 @@ TRACE_EVENT(block_split, */ TRACE_EVENT(block_bio_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), + TP_PROTO(struct bio *bio, dev_t dev, sector_t from), - TP_ARGS(q, bio, dev, from), + TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3ca6d62114f4..405637144a03 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -993,20 +993,16 @@ static void 
blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) - * @q: queue the io is for * @bio: the source bio - * @dev: target device + * @dev: source device * @from: source sector * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * + * Called after a bio is remapped to a different device and/or sector. **/ -static void blk_add_trace_bio_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, + sector_t from) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; -- cgit v1.2.3 From a54895fa057c67700270777f7661d8d3c7fda88a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:39 +0100 Subject: block: remove the request_queue argument to request based tracepoints The request_queue can trivially be derived from the request. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 8 ++++---- drivers/md/dm-rq.c | 2 +- drivers/s390/scsi/zfcp_fsf.c | 3 +-- include/linux/blktrace_api.h | 5 ++--- include/trace/events/block.h | 30 ++++++++++++------------------ kernel/trace/blktrace.c | 44 +++++++++++++++++--------------------------- 8 files changed, 39 insertions(+), 57 deletions(-) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4071daa88a5e..7497d86fff38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -799,7 +799,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d1eafe2c045c..deff4e826e23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { - trace_block_rq_insert(rq->q, rq); + trace_block_rq_insert(rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); diff --git a/block/blk-mq.c b/block/blk-mq.c index 21e2b4b6b742..cf3916e2852f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -760,7 +760,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -1821,7 +1821,7 @@ static inline void 
__blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1878,7 +1878,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 729a72ec30cc..13b4385f4d5a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -397,7 +397,7 @@ static int map_request(struct dm_rq_target_io *tio) } /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 6cb963a06777..37d450f46952 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2359,8 +2359,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) } } - blk_add_driver_data(scsi->request->q, scsi->request, &blktrc, - sizeof(blktrc)); + blk_add_driver_data(scsi->request, &blktrc, sizeof(blktrc)); } /** diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 3b6ff5902edc..05556573b896 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -75,8 +75,7 @@ static inline bool blk_trace_note_message_enabled(struct request_queue *q) return ret; } -extern void blk_add_driver_data(struct request_queue *q, struct request *rq, - void *data, size_t len); +extern void blk_add_driver_data(struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct 
block_device *bdev, char __user *arg); @@ -90,7 +89,7 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_add_driver_data(rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8fb89574d867..0d782663a005 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -64,7 +64,6 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, /** * block_rq_requeue - place block IO request back on a queue - * @q: queue holding operation * @rq: block IO operation request * * The block operation request @rq is being placed back into queue @@ -73,9 +72,9 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, */ TRACE_EVENT(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -147,9 +146,9 @@ TRACE_EVENT(block_rq_complete, DECLARE_EVENT_CLASS(block_rq, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -181,7 +180,6 @@ DECLARE_EVENT_CLASS(block_rq, /** * block_rq_insert - insert block operation request into queue - * @q: target queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted @@ -191,14 +189,13 @@ DECLARE_EVENT_CLASS(block_rq, */ DEFINE_EVENT(block_rq, block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device 
driver - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is sent to a @@ -206,14 +203,13 @@ DEFINE_EVENT(block_rq, block_rq_insert, */ DEFINE_EVENT(block_rq, block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is merged to another @@ -221,9 +217,9 @@ DEFINE_EVENT(block_rq, block_rq_issue, */ DEFINE_EVENT(block_rq, block_rq_merge, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** @@ -491,7 +487,6 @@ TRACE_EVENT(block_bio_remap, /** * block_rq_remap - map request for a block operation request - * @q: queue holding the operation * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation @@ -502,10 +497,9 @@ TRACE_EVENT(block_bio_remap, */ TRACE_EVENT(block_rq_remap, - TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, - sector_t from), + TP_PROTO(struct request *rq, dev_t dev, sector_t from), - TP_ARGS(q, rq, dev, from), + TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 405637144a03..7839a78205c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -795,12 +795,12 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) #endif static u64 -blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ - return blk_trace_bio_get_cgid(q, rq->bio); + return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* @@ -841,40 +841,35 @@ static void 
blk_add_trace_rq(struct request *rq, int error, rcu_read_unlock(); } -static void blk_add_trace_rq_insert(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_issue(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_merge(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_requeue(void *ignore, - struct request_queue *q, - struct request *rq) +static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, - blk_trace_request_get_cgid(rq->q, rq)); + blk_trace_request_get_cgid(rq)); } /** @@ -1037,16 +1032,14 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, * Add a trace for that action. 
* **/ -static void blk_add_trace_rq_remap(void *ignore, - struct request_queue *q, - struct request *rq, dev_t dev, +static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1058,13 +1051,12 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for * @rq: io request * @data: driver-specific data * @len: length of driver-specific data @@ -1073,14 +1065,12 @@ static void blk_add_trace_rq_remap(void *ignore, * Some drivers might want to write driver-specific data per request. * **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) +void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1088,7 +1078,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3 From 5ba1add216fe82289769045627d97f233bbcc645 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:11 +0800 Subject: blk-iocost: Fix some typos in comments Fix some typos in comments. 
Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8e20fe4bddec..087ae215529e 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -492,7 +492,7 @@ struct ioc_gq { /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. 
* * `vtime_done` is the same but progressed on completion rather @@ -1046,7 +1046,7 @@ static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. */ if (parent->child_active_sum) { @@ -2400,7 +2400,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, return cost; /* - * We only increase inuse during period and do so iff the margin has + * We only increase inuse during period and do so if the margin has * deteriorated since the previous adjustment. */ if (margin >= iocg->saved_margin || margin >= margins->low || -- cgit v1.2.3 From 647c9f03b2b66cf1f505208c313998fc833ed28b Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:12 +0800 Subject: blk-iocost: Remove unnecessary advance declaration Remove unnecessary advance declaration of struct ioc_gq. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 087ae215529e..ec4865206353 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -370,8 +370,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; -- cgit v1.2.3 From c09245f61c6ac4ef253a5fcf97e5bcfc0ce25fc7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:13 +0800 Subject: blk-iocost: Move the usage ratio calculation to the correct place We only use the hweight based usage ratio to calculate the new hweight_inuse of the iocg to decide if this iocg can donate some surplus vtime. Thus move the usage ratio calculation to the correct place to avoid unnecessary calculation for some vtime shortage iocgs. 
Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ec4865206353..09f22f9a6ba4 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2168,8 +2168,8 @@ static void ioc_timer_fn(struct timer_list *timer) /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, usage_us, usage_dur; - u32 usage, hw_active, hw_inuse; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -2200,30 +2200,32 @@ static void ioc_timer_fn(struct timer_list *timer) usage_us = iocg->usage_delta_us; usage_us_sum += usage_us; - if (vdone != vtime) { - u64 inflight_us = DIV64_U64_ROUND_UP( - cost_to_abs_cost(vtime - vdone, hw_inuse), - ioc->vtime_base_rate); - usage_us = max(usage_us, inflight_us); - } - - /* convert to hweight based usage ratio */ - if (time_after64(iocg->activated_at, ioc->period_at)) - usage_dur = max_t(u64, now.now - iocg->activated_at, 1); - else - usage_dur = max_t(u64, now.now - ioc->period_at, 1); - - usage = clamp_t(u32, - DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, - usage_dur), - 1, WEIGHT_ONE); - /* see whether there's surplus vtime */ WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); if (hw_inuse < hw_active || (!waitqueue_active(&iocg->waitq) && time_before64(vtime, now.vnow - ioc->margins.low))) { - u32 hwa, old_hwi, hwm, new_hwi; + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = 
max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); /* * Already donating or accumulated enough to start. -- cgit v1.2.3 From 2474787a75b4f358e81f367653c73edecd67aa2d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:14 +0800 Subject: blk-iocost: Factor out the active iocgs' state check into a separate function Factor out the iocgs' state check into a separate function to simplify the ioc_timer_fn(). No functional change. Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 94 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 54 insertions(+), 40 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 09f22f9a6ba4..7dd1424d5833 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2069,40 +2069,21 @@ static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, } } -static void ioc_timer_fn(struct timer_list *timer) +/* + * Check the active iocgs' state to avoid oversleeping and deactive + * idle iocgs. + * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. 
+ */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) { - struct ioc *ioc = container_of(timer, struct ioc, timer); + int nr_debtors = 0; struct ioc_gq *iocg, *tiocg; - struct ioc_now now; - LIST_HEAD(surpluses); - int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; - u64 usage_us_sum = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; - u32 missed_ppm[2], rq_wait_pct; - u64 period_vtime; - int prev_busy_level; - - /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); - /* take care of active iocgs */ - spin_lock_irq(&ioc->lock); - - ioc_now(ioc, &now); - - period_vtime = now.vnow - ioc->period_at_vtime; - if (WARN_ON_ONCE(!period_vtime)) { - spin_unlock_irq(&ioc->lock); - return; - } - - /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. 
- */ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && !iocg->delay && !iocg_is_idle(iocg)) @@ -2112,24 +2093,24 @@ static void ioc_timer_fn(struct timer_list *timer) /* flush wait and indebt stat deltas */ if (iocg->wait_since) { - iocg->local_stat.wait_us += now.now - iocg->wait_since; - iocg->wait_since = now.now; + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; } if (iocg->indebt_since) { iocg->local_stat.indebt_us += - now.now - iocg->indebt_since; - iocg->indebt_since = now.now; + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; } if (iocg->indelay_since) { iocg->local_stat.indelay_us += - now.now - iocg->indelay_since; - iocg->indelay_since = now.now; + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; } if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, true, &now); + iocg_kick_waitq(iocg, true, now); if (iocg->abs_vdebt || iocg->delay) nr_debtors++; } else if (iocg_is_idle(iocg)) { @@ -2143,7 +2124,7 @@ static void ioc_timer_fn(struct timer_list *timer) * error and throw away. On reactivation, it'll start * with the target budget. 
*/ - excess = now.vnow - vtime - ioc->margins.target; + excess = now->vnow - vtime - ioc->margins.target; if (excess > 0) { u32 old_hwi; @@ -2152,13 +2133,46 @@ static void ioc_timer_fn(struct timer_list *timer) WEIGHT_ONE); } - __propagate_weights(iocg, 0, 0, false, &now); + __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } + commit_weights(ioc); + return nr_debtors; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + nr_debtors = ioc_check_iocgs(ioc, &now); /* * Wait and indebt stat are flushed above and the donation calculation -- cgit v1.2.3 From 926f75f6a9ef503d45dced061e304d0324beeba1 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 26 Nov 2020 16:16:15 +0800 Subject: blk-iocost: Factor out the base vrate change into a separate function Factor out the base vrate change code into a separate function to simplify the ioc_timer_fn(). No functional change.
Signed-off-by: Baolin Wang Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-iocost.c | 99 +++++++++++++++++++++++++++++------------------------- 1 file changed, 54 insertions(+), 45 deletions(-) (limited to 'block') diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 7dd1424d5833..ffa418c0dcb1 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -971,6 +971,58 @@ done: ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); } +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. 
+ */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { @@ -2323,51 +2375,8 @@ static void ioc_timer_fn(struct timer_list *timer) ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = ioc->vtime_base_rate; - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. 
- */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages); - - ioc->vtime_base_rate = vrate; - ioc_refresh_margins(ioc); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); -- cgit v1.2.3 From f6f371f7db42917c7b2a861c4fc923cb352ce5a1 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sun, 6 Dec 2020 14:04:39 +0000 Subject: blk-mq: skip hybrid polling if iopoll doesn't spin If blk_poll() is not going to spin (i.e. @spin=false), it also must not sleep in hybrid polling, otherwise it might be pretty surprising for users trying to do a quick check and expecting no-wait behaviour. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- block/blk-mq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index cf3916e2852f..2881a457de83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3865,9 +3865,10 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) * the state. Like for the other success return cases, the * caller is responsible for checking if the IO completed.
If * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. + * straight to the busy poll loop. If specified not to spin, + * we also should not sleep. */ - if (blk_mq_poll_hybrid(q, hctx, cookie)) + if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) return 1; hctx->poll_considered++; -- cgit v1.2.3 From 2afdeb23e4750acb4ff16fd86f566c9074708691 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Wed, 11 Nov 2020 16:36:06 +0900 Subject: block: Improve blk_revalidate_disk_zones() checks Improves the checks on the zones of a zoned block device done in blk_revalidate_disk_zones() by making sure that the device report_zones method did report at least one zone and that the zones reported exactly cover the entire disk capacity, that is, that there are no missing zones at the end of the disk sector range. Signed-off-by: Damien Le Moal Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-zoned.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 6817a673e5ce..7a68b6e4300c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -508,15 +508,29 @@ int blk_revalidate_disk_zones(struct gendisk *disk, noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } memalloc_noio_restore(noio_flag); + /* + * If zones were reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != get_capacity(disk)) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps).
*/ blk_mq_freeze_queue(q); - if (ret >= 0) { + if (ret > 0) { blk_queue_chunk_sectors(q, args.zone_sectors); q->nr_zones = args.nr_zones; swap(q->seq_zones_wlock, args.seq_zones_wlock); -- cgit v1.2.3 From cc29e1bf0d63f728a5bd60ef22638bbf77369552 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Thu, 26 Nov 2020 17:18:52 +0800 Subject: block: disable iopoll for split bio iopoll is initially for small size, latency sensitive IO. It doesn't work well for big IO, especially when it needs to be split to multiple bios. In this case, the returned cookie of __submit_bio_noacct_mq() is indeed the cookie of the last split bio. The completion of *this* last split bio done by iopoll doesn't mean the whole original bio has completed. Callers of iopoll still need to wait for completion of other split bios. Besides bio splitting may cause more trouble for iopoll which isn't supposed to be used in case of big IO. iopoll for split bio may cause potential race if CPU migration happens during bio submission. Since the returned cookie is that of the last split bio, polling on the corresponding hardware queue doesn't help complete other split bios, if these split bios are enqueued into different hardware queues. Since interrupts are disabled for polling queues, the completion of these other split bios depends on timeout mechanism, thus causing a potential hang. iopoll for split bio may also cause hang for sync polling. Currently both the blkdev and iomap-based fs (ext4/xfs, etc) support sync polling in direct IO routine. These routines will submit bio without REQ_NOWAIT flag set, and then start sync polling in current process context. The process may hang in blk_mq_get_tag() if the submitted bio has to be split into multiple bios and can rapidly exhaust the queue depth. The process are waiting for the completion of the previously allocated requests, which should be reaped by the following polling, and thus causing a deadlock. 
To avoid these subtle trouble described above, just disable iopoll for split bio and return BLK_QC_T_NONE in this case. The side effect is that non-HIPRI IO also returns BLK_QC_T_NONE now. It should be acceptable since the returned cookie is never used for non-HIPRI IO. Suggested-by: Ming Lei Signed-off-by: Jeffle Xu Reviewed-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-merge.c | 8 ++++++++ block/blk-mq.c | 5 +++++ 2 files changed, 13 insertions(+) (limited to 'block') diff --git a/block/blk-merge.c b/block/blk-merge.c index 7497d86fff38..c3399bf29e9c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -279,6 +279,14 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, return NULL; split: *segs = nsegs; + + /* + * Bio splitting may cause subtle trouble such as hang when doing sync + * iopoll in direct IO routine. Given performance gain of iopoll for + * big IO can be trival, disable iopoll when split needed. + */ + bio->bi_opf &= ~REQ_HIPRI; + return bio_split(bio, sectors, GFP_NOIO, bs); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 2881a457de83..95ecc4c69969 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2159,6 +2159,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs; blk_qc_t cookie; blk_status_t ret; + bool hipri; blk_queue_bounce(q, &bio); __blk_queue_split(&bio, &nr_segs); @@ -2175,6 +2176,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) rq_qos_throttle(q, bio); + hipri = bio->bi_opf & REQ_HIPRI; + data.cmd_flags = bio->bi_opf; rq = __blk_mq_alloc_request(&data); if (unlikely(!rq)) { @@ -2267,6 +2270,8 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) blk_mq_sched_insert_request(rq, false, true, true); } + if (!hipri) + return BLK_QC_T_NONE; return cookie; queue_exit: blk_queue_exit(q); -- cgit v1.2.3 From fb01a2932e81a1fb2273f87ff92dc8172b8880ee Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:36 +0800 Subject: blk-mq: add new API of 
blk_mq_hctx_set_fq_lock_class flush_end_io() may be called recursively from some driver, such as nvme-loop, so lockdep may complain 'possible recursive locking'. Commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing") tried to address this issue by assigning dynamically allocated per-flush-queue lock class. This solution adds synchronize_rcu() for each hctx's release handler, and causes horrible SCSI MQ probe delay(more than half an hour on megaraid sas). Add new API of blk_mq_hctx_set_fq_lock_class() for these drivers, so we just need to use driver specific lock class for avoiding the lockdep warning of 'possible recursive locking'. Tested-by: Kashyap Desai Reported-by: Qian Cai Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 25 +++++++++++++++++++++++++ include/linux/blk-mq.h | 3 +++ 2 files changed, 28 insertions(+) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index 9507dcdd5881..bf51588762d8 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -490,3 +490,28 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. + * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class + * key. We need to assign different lock class for these driver's + * fq->mq_flush_lock for avoiding the lockdep warning. 
+ * + * Use dynamically allocated lock class key for each 'blk_flush_queue' + * instance is over-kill, and more worse it introduces horrible boot delay + * issue because synchronize_rcu() is implied in lockdep_unregister_key which + * is called for each hctx release. SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernel always enable lockdep option. It is observed that more than half + * an hour is taken during SCSI MQ probe with per-fq lock class. + */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 794b2a33a2c3..5f639240760e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -5,6 +5,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -594,5 +595,7 @@ static inline void blk_mq_cleanup_rq(struct request *rq) } blk_qc_t blk_mq_submit_bio(struct bio *bio); +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key); #endif -- cgit v1.2.3 From 7aa390ec2d9db0cd6677d95d0b8f307f9c086770 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 3 Dec 2020 09:26:38 +0800 Subject: Revert "block: Fix a lockdep complaint triggered by request queue flushing" This reverts commit b3c6a59975415bde29cfd76ff1ab008edbf614a9. Now we can avoid nvme-loop lockdep warning of 'lockdep possible recursive locking' by nvme-loop's lock class, no need to apply dynamically allocated lock class key, so revert commit b3c6a5997541("block: Fix a lockdep complaint triggered by request queue flushing"). This way fixes horrible SCSI probe delay issue on megaraid_sas, and it is reported the whole probe may take more than half an hour. 
Tested-by: Kashyap Desai Reported-by: Qian Cai Reviewed-by: Christoph Hellwig Cc: Sumit Saxena Cc: John Garry Cc: Kashyap Desai Cc: Bart Van Assche Cc: Hannes Reinecke Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-flush.c | 5 ----- block/blk.h | 1 - 2 files changed, 6 deletions(-) (limited to 'block') diff --git a/block/blk-flush.c b/block/blk-flush.c index bf51588762d8..996d5d03dade 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,7 +69,6 @@ #include #include #include -#include #include "blk.h" #include "blk-mq.h" @@ -469,9 +468,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[1]); INIT_LIST_HEAD(&fq->flush_data_in_flight); - lockdep_register_key(&fq->key); - lockdep_set_class(&fq->mq_flush_lock, &fq->key); - return fq; fail_rq: @@ -486,7 +482,6 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } diff --git a/block/blk.h b/block/blk.h index 98f0b1ae2641..d23d018fd2cd 100644 --- a/block/blk.h +++ b/block/blk.h @@ -25,7 +25,6 @@ struct blk_flush_queue { struct list_head flush_data_in_flight; struct request *flush_rq; - struct lock_class_key key; spinlock_t mq_flush_lock; }; -- cgit v1.2.3 From 91cdf265b74bf63a69949d6db08a60523207400c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:53 +0900 Subject: blk-mq: add helper allocating tagset->tags tagset->tags is allocated from blk_mq_alloc_tag_set() rather than being reallocated. This patch adds a helper to make its meaning explicit which is to allocate rather than to reallocate.
Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index 95ecc4c69969..e2bd9ef81d55 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3382,6 +3382,12 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return 0; } +static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, + int new_nr_hw_queues) +{ + return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); +} + /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the @@ -3435,7 +3441,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) return -ENOMEM; ret = -ENOMEM; -- cgit v1.2.3 From d220a21410e445324b8ae67d93f9c51406f99a29 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:54 +0900 Subject: blk-mq: update arg in comment of blk_mq_map_queue Update the mis-named argument description of blk_mq_map_queue(). This patch also updates the description of that argument to software queue percpu context.
Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/blk-mq.h b/block/blk-mq.h index c696515766c7..c1458d9502f1 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -99,7 +99,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue * @flags: request command flags - * @cpu: cpu ctx + * @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, -- cgit v1.2.3 From fa94ba8a7b22890e6a17b39b9359e114fe18cd59 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sat, 5 Dec 2020 00:20:55 +0900 Subject: blk-mq: fix msec comment from micro to milli seconds The delay to wait before running the queue is in milliseconds: it is passed to delayed work via msecs_to_jiffies(), which converts milliseconds to jiffies. Signed-off-by: Minwoo Im Reviewed-by: John Garry Signed-off-by: Jens Axboe --- block/blk-mq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/blk-mq.c b/block/blk-mq.c index e2bd9ef81d55..6f207ec9ef83 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1594,7 +1594,7 @@ select_cpu: * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * If !@async, try to run the queue now. Else, run the queue asynchronously and * with a delay of @msecs. @@ -1623,7 +1623,7 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run.
- * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ @@ -1687,7 +1687,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { -- cgit v1.2.3