From f0cd988016f679f489f0ca8498c300eaff370b28 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:07 +0200 Subject: fs: massage locking helpers Multiple people have balked at the the fact that super_lock{_shared,_excluse}() return booleans and even if they return false hold s_umount. So let's change them to only hold s_umount when true is returned and change the code accordingly. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-1-599c19f4faac@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 105 +++++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 61 insertions(+), 44 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 076392396e72..aa43090ea52d 100644 --- a/fs/super.c +++ b/fs/super.c @@ -105,8 +105,9 @@ static inline bool wait_born(struct super_block *sb) * * The caller must have acquired a temporary reference on @sb->s_count. * - * Return: This returns true if SB_BORN was set, false if SB_DYING was - * set. The function acquires s_umount and returns with it held. + * Return: The function returns true if SB_BORN was set and with + * s_umount held. The function returns false if SB_DYING was + * set and without s_umount held. */ static __must_check bool super_lock(struct super_block *sb, bool excl) { @@ -121,8 +122,10 @@ relock: * @sb->s_root is NULL and @sb->s_active is 0. No one needs to * grab a reference to this. Tell them so. */ - if (sb->s_flags & SB_DYING) + if (sb->s_flags & SB_DYING) { + super_unlock(sb, excl); return false; + } /* Has called ->get_tree() successfully. */ if (sb->s_flags & SB_BORN) @@ -140,13 +143,13 @@ relock: goto relock; } -/* wait and acquire read-side of @sb->s_umount */ +/* wait and try to acquire read-side of @sb->s_umount */ static inline bool super_lock_shared(struct super_block *sb) { return super_lock(sb, false); } -/* wait and acquire write-side of @sb->s_umount */ +/* wait and try to acquire write-side of @sb->s_umount */ static inline bool super_lock_excl(struct super_block *sb) { return super_lock(sb, true); @@ -535,16 +538,18 @@ EXPORT_SYMBOL(deactivate_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { - bool born; + bool locked; s->s_count++; spin_unlock(&sb_lock); - born = super_lock_excl(s); - if (born && atomic_inc_not_zero(&s->s_active)) { - put_super(s); - return 1; + locked = super_lock_excl(s); + if (locked) { + if (atomic_inc_not_zero(&s->s_active)) { + put_super(s); + return 1; + } + super_unlock_excl(s); } - super_unlock_excl(s); put_super(s); return 0; } @@ -575,7 +580,6 @@ static inline bool wait_dead(struct super_block *sb) */ static bool grab_super_dead(struct super_block *sb) { - sb->s_count++; if (grab_super(sb)) { put_super(sb); @@ -961,15 +965,17 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -997,15 +1003,17 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); - born = super_lock_shared(sb); - if (born && sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (locked) { + if (sb->s_root) + f(sb, arg); + super_unlock_shared(sb); + } spin_lock(&sb_lock); if (p) @@ -1054,15 +1062,17 @@ struct super_block *user_get_super(dev_t dev, bool excl) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (sb->s_dev == dev) { - bool born; + bool locked; sb->s_count++; spin_unlock(&sb_lock); /* still alive? */ - born = super_lock(sb, excl); - if (born && sb->s_root) - return sb; - super_unlock(sb, excl); + locked = super_lock(sb, excl); + if (locked) { + if (sb->s_root) + return sb; + super_unlock(sb, excl); + } /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -1173,9 +1183,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1186,7 +1196,8 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - super_unlock_excl(sb); + if (locked) + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1209,16 +1220,17 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - bool born = super_lock_excl(sb); + bool locked = super_lock_excl(sb); - if (born && sb->s_root) { + if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - } else { - super_unlock_excl(sb); + return; } + if (locked) + super_unlock_excl(sb); } static void do_thaw_all(struct work_struct *work) @@ -1432,7 +1444,7 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; - bool born; + bool locked; lockdep_assert_held(&bdev->bd_holder_lock); lockdep_assert_not_held(&sb->s_umount); @@ -1444,9 +1456,8 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) spin_unlock(&sb_lock); mutex_unlock(&bdev->bd_holder_lock); - born = super_lock_shared(sb); - if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { - super_unlock_shared(sb); + locked = super_lock_shared(sb); + if (!locked || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { put_super(sb); return NULL; } @@ -1962,9 +1973,11 @@ int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while freezing!"); + return -EINVAL; + } atomic_inc(&sb->s_active); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { @@ -2012,8 +2025,10 @@ retry: /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while freezing!"); + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while freezing!"); + return -EINVAL; + } /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -2131,8 +2146,10 @@ out: */ int thaw_super(struct super_block *sb, enum freeze_holder who) { - if (!super_lock_excl(sb)) - WARN(1, "Dying superblock while thawing!"); + if (!super_lock_excl(sb)) { + WARN_ON_ONCE("Dying superblock while thawing!"); + return -EINVAL; + } return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); -- cgit v1.2.3 From 982c3b3058433f20aba9fb032599cee5dfc17328 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:08 +0200 Subject: bdev: rename freeze and thaw helpers We have bdev_mark_dead() etc and we're going to move block device freezing to holder ops in the next patch. Make the naming consistent: * freeze_bdev() -> bdev_freeze() * thaw_bdev() -> bdev_thaw() Also document the return code. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-2-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 22 +++++++++++++--------- drivers/md/dm.c | 4 ++-- fs/bcachefs/fs-ioctl.c | 4 ++-- fs/ext4/ioctl.c | 4 ++-- fs/f2fs/file.c | 4 ++-- fs/super.c | 4 ++-- fs/xfs/xfs_fsops.c | 4 ++-- include/linux/blkdev.h | 4 ++-- 8 files changed, 27 insertions(+), 23 deletions(-) (limited to 'fs/super.c') diff --git a/block/bdev.c b/block/bdev.c index e4cfb7adb645..a890da1b3adf 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -207,18 +207,20 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) EXPORT_SYMBOL(sync_blockdev_range); /** - * freeze_bdev - lock a filesystem and force it into a consistent state + * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * freeze requests arrive simultaneously. It counts up in bdev_freeze() and + * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. + * + * Return: On success zero is returned, negative error code on failure. */ -int freeze_bdev(struct block_device *bdev) +int bdev_freeze(struct block_device *bdev) { struct super_block *sb; int error = 0; @@ -248,15 +250,17 @@ done: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(freeze_bdev); +EXPORT_SYMBOL(bdev_freeze); /** - * thaw_bdev - unlock filesystem + * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. */ -int thaw_bdev(struct block_device *bdev) +int bdev_thaw(struct block_device *bdev) { struct super_block *sb; int error = -EINVAL; @@ -285,7 +289,7 @@ out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(thaw_bdev); +EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 23c32cd1f1d8..8dcabf84d866 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2675,7 +2675,7 @@ static int lock_fs(struct mapped_device *md) WARN_ON(test_bit(DMF_FROZEN, &md->flags)); - r = freeze_bdev(md->disk->part0); + r = bdev_freeze(md->disk->part0); if (!r) set_bit(DMF_FROZEN, &md->flags); return r; @@ -2685,7 +2685,7 @@ static void unlock_fs(struct mapped_device *md) { if (!test_bit(DMF_FROZEN, &md->flags)) return; - thaw_bdev(md->disk->part0); + bdev_thaw(md->disk->part0); clear_bit(DMF_FROZEN, &md->flags); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 5a39bcb597a3..45e663342e38 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -291,14 +291,14 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(c->vfs_sb->s_bdev); + ret = bdev_freeze(c->vfs_sb->s_bdev); if (ret) goto err; bch2_journal_flush(&c->journal); c->vfs_sb->s_flags |= SB_RDONLY; bch2_fs_emergency_read_only(c); - thaw_bdev(c->vfs_sb->s_bdev); + bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4f931f80cb34..aa6be510eb8f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e50363583f01..4580dfefd5e9 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2239,11 +2239,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) switch (in) { case F2FS_GOING_DOWN_FULLSYNC: - ret = freeze_bdev(sb->s_bdev); + ret = bdev_freeze(sb->s_bdev); if (ret) goto out; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN); - thaw_bdev(sb->s_bdev); + bdev_thaw(sb->s_bdev); break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ diff --git a/fs/super.c b/fs/super.c index aa43090ea52d..8333aaa878cf 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1224,7 +1224,7 @@ static void do_thaw_all_callback(struct super_block *sb) if (locked && sb->s_root) { if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !thaw_bdev(sb->s_bdev)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); return; @@ -1532,7 +1532,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, /* * Until SB_BORN flag is set, there can be no active superblock * references and thus no filesystem freezing. get_active_super() will - * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. + * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed. * * It is enough to check bdev was not frozen before we set s_bdev. */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 7cb75cb6b8e9..57076a25f17d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -482,9 +482,9 @@ xfs_fs_goingdown( { switch (inflags) { case XFS_FSOP_GOING_FLAGS_DEFAULT: { - if (!freeze_bdev(mp->m_super->s_bdev)) { + if (!bdev_freeze(mp->m_super->s_bdev)) { xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT); - thaw_bdev(mp->m_super->s_bdev); + bdev_thaw(mp->m_super->s_bdev); } break; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 51fa7ffdee83..7a3da7f44afb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1541,8 +1541,8 @@ static inline int early_lookup_bdev(const char *pathname, dev_t *dev) } #endif /* CONFIG_BLOCK */ -int freeze_bdev(struct block_device *bdev); -int thaw_bdev(struct block_device *bdev); +int bdev_freeze(struct block_device *bdev); +int bdev_thaw(struct block_device *bdev); struct io_comp_batch { struct request *req_list; -- cgit v1.2.3 From 49ef8832fb1a9e0da0020eb17480fd286433bc13 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 27 Sep 2023 15:21:16 +0200 Subject: bdev: implement freeze and thaw holder operations The old method of implementing block device freeze and thaw operations required us to rely on get_active_super() to walk the list of all superblocks on the system to find any superblock that might use the block device. This is wasteful and not very pleasant overall. Now that we can finally go straight from block device to owning superblock things become way simpler. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-5-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- block/bdev.c | 65 +++++++++++++++---------------- fs/super.c | 99 +++++++++++++++++++++++++++++++++++++---------- include/linux/blk_types.h | 2 +- 3 files changed, 112 insertions(+), 54 deletions(-) (limited to 'fs/super.c') diff --git a/block/bdev.c b/block/bdev.c index f27f1a7db2bd..53a2c653a259 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -222,31 +222,27 @@ EXPORT_SYMBOL(sync_blockdev_range); */ int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); } - bdev->bd_fsfreeze_sb = sb; -sync: - error = sync_blockdev(bdev); -done: + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } @@ -262,29 +258,32 @@ EXPORT_SYMBOL(bdev_freeze); */ int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. + */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) goto out; error = 0; - if (--bdev->bd_fsfreeze_count > 0) + if (nr_freeze > 0) goto out; - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; + atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; diff --git a/fs/super.c b/fs/super.c index 8333aaa878cf..8c943ab1f424 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1440,7 +1440,7 @@ EXPORT_SYMBOL(sget_dev); * * The function must be called with bdev->bd_holder_lock and releases it. */ -static struct super_block *bdev_super_lock_shared(struct block_device *bdev) +static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl) __releases(&bdev->bd_holder_lock) { struct super_block *sb = bdev->bd_holder; @@ -1454,18 +1454,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev) spin_lock(&sb_lock); sb->s_count++; spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_holder_lock); - locked = super_lock_shared(sb); - if (!locked || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { - put_super(sb); - return NULL; - } + locked = super_lock(sb, excl); + /* - * The superblock is active and we hold s_umount, we can drop our - * temporary reference now. - */ + * If the superblock wasn't already SB_DYING then we hold + * s_umount and can safely drop our temporary reference. + */ put_super(sb); + + if (!locked) + return NULL; + + if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock(sb, excl); + return NULL; + } + return sb; } @@ -1473,7 +1480,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; @@ -1491,16 +1498,74 @@ static void fs_bdev_sync(struct block_device *bdev) { struct super_block *sb; - sb = bdev_super_lock_shared(bdev); + sb = bdev_super_lock(bdev, false); if (!sb) return; + sync_filesystem(sb); super_unlock_shared(sb); } +static struct super_block *get_bdev_super(struct block_device *bdev) +{ + bool active = false; + struct super_block *sb; + + sb = bdev_super_lock(bdev, true); + if (sb) { + active = atomic_inc_not_zero(&sb->s_active); + super_unlock_excl(sb); + } + if (!active) + return NULL; + return sb; +} + +static int fs_bdev_freeze(struct block_device *bdev) +{ + struct super_block *sb; + int error = 0; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (!sb) + return -EINVAL; + + if (sb->s_op->freeze_super) + error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + else + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + if (!error) + error = sync_blockdev(bdev); + deactivate_super(sb); + return error; +} + +static int fs_bdev_thaw(struct block_device *bdev) +{ + struct super_block *sb; + int error; + + lockdep_assert_held(&bdev->bd_fsfreeze_mutex); + + sb = get_bdev_super(bdev); + if (WARN_ON_ONCE(!sb)) + return -EINVAL; + + if (sb->s_op->thaw_super) + error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + else + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + deactivate_super(sb); + return error; +} + const struct blk_holder_ops fs_holder_ops = { .mark_dead = fs_bdev_mark_dead, .sync = fs_bdev_sync, + .freeze = fs_bdev_freeze, + .thaw = fs_bdev_thaw, }; EXPORT_SYMBOL_GPL(fs_holder_ops); @@ -1530,15 +1595,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, } /* - * Until SB_BORN flag is set, there can be no active superblock - * references and thus no filesystem freezing. get_active_super() will - * just loop waiting for SB_BORN so even bdev_freeze() cannot proceed. - * - * It is enough to check bdev was not frozen before we set s_bdev. + * It is enough to check bdev was not frozen before we set + * s_bdev as freezing will wait until SB_BORN is set. */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (atomic_read(&bdev->bd_fsfreeze_count) > 0) { if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); bdev_release(bdev_handle); @@ -1551,7 +1611,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, if (bdev_stable_writes(bdev)) sb->s_iflags |= SB_I_STABLE_WRITES; spin_unlock(&sb_lock); - mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d5c5e59ddbd2..88e1848b0869 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -57,7 +57,7 @@ struct block_device { const struct blk_holder_ops *bd_holder_ops; struct mutex bd_holder_lock; /* The counter of freeze processes */ - int bd_fsfreeze_count; + atomic_t bd_fsfreeze_count; int bd_holders; struct kobject *bd_holder_dir; -- cgit v1.2.3 From 434f8d8299f2a0c97578f77ab23a70cd0ae56544 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:12 +0200 Subject: fs: remove get_active_super() This function is now unused so remove it. One less function that uses the global superblock list. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-6-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 28 ---------------------------- include/linux/fs.h | 1 - 2 files changed, 29 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 8c943ab1f424..b49a49d70450 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1027,34 +1027,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_active_super - get an active reference to the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. Returns the superblock with an active - * reference or %NULL if none was found. - */ -struct super_block *get_active_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_bdev == bdev) { - if (!grab_super(sb)) - return NULL; - super_unlock_excl(sb); - return sb; - } - } - spin_unlock(&sb_lock); - return NULL; -} - struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; diff --git a/include/linux/fs.h b/include/linux/fs.h index 98b7a7a8c42e..7dc6c1bf5f55 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3121,7 +3121,6 @@ extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); -extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From 97cbed04e71da698d324c585f79f981b032a1bd5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 15:01:14 +0200 Subject: fs: remove unused helper The grab_super() helper is now only used by grab_super_dead(). Merge the two helpers into one. Link: https://lore.kernel.org/r/20231024-vfs-super-freeze-v2-8-599c19f4faac@kernel.org Reviewed-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 54 ++++++++++++++---------------------------------------- 1 file changed, 14 insertions(+), 40 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index b49a49d70450..78bbbb9b3ee9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -523,37 +523,6 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); -/** - * grab_super - acquire an active reference - * @s: reference we are trying to make active - * - * Tries to acquire an active reference. grab_super() is used when we - * had just found a superblock in super_blocks or fs_type->fs_supers - * and want to turn it into a full-blown active reference. grab_super() - * is called with sb_lock held and drops it. Returns 1 in case of - * success, 0 if we had failed (superblock contents was already dead or - * dying when grab_super() had been called). Note that this is only - * called for superblocks not in rundown mode (== ones still on ->fs_supers - * of their type), so increment of ->s_count is OK here. - */ -static int grab_super(struct super_block *s) __releases(sb_lock) -{ - bool locked; - - s->s_count++; - spin_unlock(&sb_lock); - locked = super_lock_excl(s); - if (locked) { - if (atomic_inc_not_zero(&s->s_active)) { - put_super(s); - return 1; - } - super_unlock_excl(s); - } - put_super(s); - return 0; -} - static inline bool wait_dead(struct super_block *sb) { unsigned int flags; @@ -567,7 +536,7 @@ static inline bool wait_dead(struct super_block *sb) } /** - * grab_super_dead - acquire an active reference to a superblock + * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire * * Acquire a temporary reference on a superblock and try to trade it for @@ -578,16 +547,21 @@ static inline bool wait_dead(struct super_block *sb) * Return: This returns true if an active reference could be acquired, * false if not. */ -static bool grab_super_dead(struct super_block *sb) +static bool grab_super(struct super_block *sb) { + bool locked; + sb->s_count++; - if (grab_super(sb)) { - put_super(sb); - lockdep_assert_held(&sb->s_umount); - return true; + spin_unlock(&sb_lock); + locked = super_lock_excl(sb); + if (locked) { + if (atomic_inc_not_zero(&sb->s_active)) { + put_super(sb); + return true; + } + super_unlock_excl(sb); } wait_var_event(&sb->s_flags, wait_dead(sb)); - lockdep_assert_not_held(&sb->s_umount); put_super(sb); return false; } @@ -838,7 +812,7 @@ share_extant_sb: warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; @@ -882,7 +856,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super_dead(old)) + if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; -- cgit v1.2.3 From 761c47a973441cf2a3602d2a9d0407eb0ec6ce1c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 24 Oct 2023 16:53:39 +0200 Subject: fs: simplify setup_bdev_super() calls There's no need to drop s_umount anymore now that we removed all sources where s_umount is taken beneath open_mutex or bd_holder_lock. Link: https://lore.kernel.org/r/20231024-vfs-super-rework-v1-1-37a8aa697148@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 78bbbb9b3ee9..18b42281ebe2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1601,15 +1601,7 @@ int get_tree_bdev(struct fs_context *fc, return -EBUSY; } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, fc->sb_flags, fc); - __super_lock_excl(s); if (!error) error = fill_super(s, fc); if (error) { @@ -1653,15 +1645,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, return ERR_PTR(-EBUSY); } } else { - /* - * We drop s_umount here because we need to open the bdev and - * bdev->open_mutex ranks above s_umount (blkdev_put() -> - * bdev_mark_dead()). It is safe because we have active sb - * reference and SB_BORN is not set yet. - */ - super_unlock_excl(s); error = setup_bdev_super(s, flags, NULL); - __super_lock_excl(s); if (!error) error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { -- cgit v1.2.3 From 24c372d58223b83d0be5230c1a0e9370b60fb0ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Oct 2023 08:40:01 +0200 Subject: fs: streamline thaw_super_locked Add a new out_unlock label to share code that just releases s_umount and returns an error, and rename and reuse the out label that deactivates the sb for one more case. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231027064001.GA9469@lst.de Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 18b42281ebe2..faeab453e6e9 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2070,34 +2070,28 @@ EXPORT_SYMBOL(freeze_super); */ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { - int error; + int error = -EINVAL; - if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (!(sb->s_writers.freeze_holders & who)) { - super_unlock_excl(sb); - return -EINVAL; - } + if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) + goto out_unlock; + if (!(sb->s_writers.freeze_holders & who)) + goto out_unlock; - /* - * Freeze is shared with someone else. Release our hold and - * drop the active ref that freeze_super assigned to the - * freezer. - */ - if (sb->s_writers.freeze_holders & ~who) { - sb->s_writers.freeze_holders &= ~who; - deactivate_locked_super(sb); - return 0; - } - } else { - super_unlock_excl(sb); - return -EINVAL; + /* + * Freeze is shared with someone else. Release our hold and drop the + * active ref that freeze_super assigned to the freezer. + */ + error = 0; + if (sb->s_writers.freeze_holders & ~who) { + sb->s_writers.freeze_holders &= ~who; + goto out_deactivate; } if (sb_rdonly(sb)) { sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); - goto out; + goto out_deactivate; } lockdep_sb_freeze_acquire(sb); @@ -2107,8 +2101,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); - super_unlock_excl(sb); - return error; + goto out_unlock; } } @@ -2116,9 +2109,13 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); -out: +out_deactivate: deactivate_locked_super(sb); return 0; + +out_unlock: + super_unlock_excl(sb); + return error; } /** -- cgit v1.2.3 From efa5d065b4a0790061921194019c321ad335b8db Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 4 Nov 2023 15:00:12 +0100 Subject: fs: remove dead check Above we call super_lock_excl() which waits until the superblock is SB_BORN and since SB_BORN is never unset once set this check can never fire. Plus, we also hold an active reference at this point already so this superblock can't even be shutdown. Link: https://lore.kernel.org/r/20231104-vfs-multi-device-freeze-v2-1-5b5b69626eac@kernel.org Tested-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index faeab453e6e9..d2b026566dea 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1996,11 +1996,6 @@ retry: goto retry; } - if (!(sb->s_flags & SB_BORN)) { - super_unlock_excl(sb); - return 0; /* sic - it's "nothing to do" */ - } - if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.freeze_holders |= who; -- cgit v1.2.3 From 7366f8b6fc6aa21c4199cb5d337b023df69745b0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 4 Nov 2023 15:00:13 +0100 Subject: fs: handle freezing from multiple devices Before [1] freezing a filesystems through the block layer only worked for the main block device as the owning superblock of additional block devices could not be found. Any filesystem that made use of multiple block devices would only be freezable via it's main block device. For example, consider xfs over device mapper with /dev/dm-0 as main block device and /dev/dm-1 as external log device. Two freeze requests before [1]: (1) dmsetup suspend /dev/dm-0 on the main block device bdev_freeze(dm-0) -> dm-0->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns 0. (2) dmsetup suspend /dev/dm-1 on the log device bdev_freeze(dm-1) -> dm-1->bd_fsfreeze_count++ The owning superblock isn't found and only the block device freeze count is incremented. Returns 0. Two freeze requests after [1]: (1') dmsetup suspend /dev/dm-0 on the main block device bdev_freeze(dm-0) -> dm-0->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns 0. (2') dmsetup suspend /dev/dm-1 on the log device bdev_freeze(dm-0) -> dm-0->bd_fsfreeze_count++ -> freeze_super(xfs-sb) The owning superblock is found and the filesystem gets frozen. Returns -EBUSY. When (2') is called we initiate a freeze from another block device of the same superblock. So we increment the bd_fsfreeze_count for that additional block device. But we now also find the owning superblock for additional block devices and call freeze_super() again which reports -EBUSY. This can be reproduced through xfstests via: mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p4 /dev/nvme1n1p3 mkfs.xfs -f -m crc=1,reflink=1,rmapbt=1, -i sparse=1 -lsize=1g,logdev=/dev/nvme1n1p6 /dev/nvme1n1p5 FSTYP=xfs export TEST_DEV=/dev/nvme1n1p3 export TEST_DIR=/mnt/test export TEST_LOGDEV=/dev/nvme1n1p4 export SCRATCH_DEV=/dev/nvme1n1p5 export SCRATCH_MNT=/mnt/scratch export SCRATCH_LOGDEV=/dev/nvme1n1p6 export USE_EXTERNAL=yes sudo ./check generic/311 Current semantics allow two concurrent freezers: one initiated from userspace via FREEZE_HOLDER_USERSPACE and one initiated from the kernel via FREEZE_HOLDER_KERNEL. If there are multiple concurrent freeze requests from either FREEZE_HOLDER_USERSPACE or FREEZE_HOLDER_KERNEL -EBUSY is returned. We need to preserve these semantics because as they are uapi via FIFREEZE and FITHAW ioctl()s. IOW, freezes don't nest for FIFREEZE and FITHAW. Other kernels consumers rely on non-nesting freezes as well. With freezes initiated from the block layer freezes need to nest if the same superblock is frozen via multiple devices. So we need to start counting the number of freeze requests. If FREEZE_MAY_NEST is passed alongside FREEZE_HOLDER_KERNEL or FREEZE_HOLDER_USERSPACE we allow the caller to nest freeze calls. To accommodate the old semantics we split the freeze counter into two counting kernel initiated and userspace initiated freezes separately. We can then also stop recording FREEZE_HOLDER_* in struct sb_writers. We also simplify freezing by making all concurrent freezers share a single active superblock reference count instead of having separate references for kernel and userspace. I don't see why we would need two active reference counts. Neither FREEZE_HOLDER_KERNEL nor FREEZE_HOLDER_USERSPACE can put the active reference as long as they are concurrent freezers anwyay. That was already true before we allowed nesting freezes. Survives various fstests runs with different options including the reproducer, online scrub, and online repair, fsfreze, and so on. Also survives blktests. Link: https://lore.kernel.org/linux-block/87bkccnwxc.fsf@debian-BULLSEYE-live-builder-AMD64 Link: https://lore.kernel.org/r/20231104-vfs-multi-device-freeze-v2-2-5b5b69626eac@kernel.org Fixes: 288d8706abfc ("bdev: implement freeze and thaw holder operations") [1] # no backport needed Tested-by: Chandan Babu R Reviewed-by: Christoph Hellwig Reviewed-by: Jan Kara Reported-by: Chandan Babu R Signed-off-by: Christian Brauner --- fs/super.c | 144 +++++++++++++++++++++++++++++++++++++++++------------ include/linux/fs.h | 18 ++++++- 2 files changed, 129 insertions(+), 33 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index d2b026566dea..aa4e5c11ee32 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1467,6 +1467,21 @@ static struct super_block *get_bdev_super(struct block_device *bdev) return sb; } +/** + * fs_bdev_freeze - freeze owning filesystem of block device + * @bdev: block device + * + * Freeze the filesystem that owns this block device if it is still + * active. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. + */ static int fs_bdev_freeze(struct block_device *bdev) { struct super_block *sb; @@ -1479,15 +1494,34 @@ static int fs_bdev_freeze(struct block_device *bdev) return -EINVAL; if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + error = sb->s_op->freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + error = freeze_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); return error; } +/** + * fs_bdev_thaw - thaw owning filesystem of block device + * @bdev: block device + * + * Thaw the filesystem that owns this block device. + * + * A filesystem that owns multiple block devices may be frozen from each + * block device and won't be unfrozen until all block devices are + * unfrozen. Each block device can only freeze the filesystem once as we + * nest freezes for block devices in the block layer. + * + * Return: If the thaw was successful zero is returned. If the thaw + * failed a negative error code is returned. If this function + * returns zero it doesn't mean that the filesystem is unfrozen + * as it may have been frozen multiple times (kernel may hold a + * freeze or might be frozen from other block devices). + */ static int fs_bdev_thaw(struct block_device *bdev) { struct super_block *sb; @@ -1500,9 +1534,11 @@ static int fs_bdev_thaw(struct block_device *bdev) return -EINVAL; if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + error = sb->s_op->thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + error = thaw_super(sb, + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); deactivate_super(sb); return error; } @@ -1914,6 +1950,47 @@ static int wait_for_partially_frozen(struct super_block *sb) return ret; } +#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) +#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) + +static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + ++sb->s_writers.freeze_kcount; + if (who & FREEZE_HOLDER_USERSPACE) + ++sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount) + --sb->s_writers.freeze_kcount; + if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount) + --sb->s_writers.freeze_ucount; + return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; +} + +static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) +{ + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_HOLDER_KERNEL) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_kcount == 0; + if (who & FREEZE_HOLDER_USERSPACE) + return (who & FREEZE_MAY_NEST) || + sb->s_writers.freeze_ucount == 0; + return false; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock @@ -1926,6 +2003,7 @@ static int wait_for_partially_frozen(struct super_block *sb) * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed. * * The @who argument distinguishes between the kernel and userspace trying to * freeze the filesystem. Although there cannot be multiple kernel freezes or @@ -1933,6 +2011,13 @@ static int wait_for_partially_frozen(struct super_block *sb) * userspace can both hold a filesystem frozen. The filesystem remains frozen * until there are no kernel or userspace freezes in effect. * + * A filesystem may hold multiple devices and thus a filesystems may be + * frozen through the block layer via multiple block devices. In this + * case the request is marked as being allowed to nest by passing + * FREEZE_MAY_NEST. The filesystem remains frozen until all block + * devices are unfrozen. If multiple freezes are attempted without + * FREEZE_MAY_NEST -EBUSY will be returned. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1957,6 +2042,9 @@ static int wait_for_partially_frozen(struct super_block *sb) * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. + * + * Return: If the freeze was successful zero is returned. If the freeze + * failed a negative error code is returned. */ int freeze_super(struct super_block *sb, enum freeze_holder who) { @@ -1970,20 +2058,13 @@ int freeze_super(struct super_block *sb, enum freeze_holder who) retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (sb->s_writers.freeze_holders & who) { - deactivate_locked_super(sb); - return -EBUSY; - } - - WARN_ON(sb->s_writers.freeze_holders == 0); - - /* - * Someone else already holds this type of freeze; share the - * freeze and assign the active ref to the freeze. - */ - sb->s_writers.freeze_holders |= who; - super_unlock_excl(sb); - return 0; + if (may_freeze(sb, who)) + ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); + else + ret = -EBUSY; + /* All freezers share a single active reference. */ + deactivate_locked_super(sb); + return ret; } if (sb->s_writers.frozen != SB_UNFROZEN) { @@ -1998,7 +2079,7 @@ retry: if (sb_rdonly(sb)) { /* Nothing to do really... */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); @@ -2048,7 +2129,7 @@ retry: * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ - sb->s_writers.freeze_holders |= who; + WARN_ON_ONCE(freeze_inc(sb, who) > 1); sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); @@ -2069,21 +2150,15 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; - if (!(sb->s_writers.freeze_holders & who)) - goto out_unlock; /* - * Freeze is shared with someone else. Release our hold and drop the - * active ref that freeze_super assigned to the freezer. + * All freezers share a single active reference. + * So just unlock in case there are any left. */ - error = 0; - if (sb->s_writers.freeze_holders & ~who) { - sb->s_writers.freeze_holders &= ~who; - goto out_deactivate; - } + if (freeze_dec(sb, who)) + goto out_unlock; if (sb_rdonly(sb)) { - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; @@ -2094,13 +2169,13 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR "VFS:Filesystem thaw failed\n"); + pr_err("VFS: Filesystem thaw failed\n"); + freeze_inc(sb, who); lockdep_sb_freeze_release(sb); goto out_unlock; } } - sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); @@ -2124,6 +2199,11 @@ out_unlock: * @who should be: * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. + * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed + * + * A filesystem may hold multiple devices and thus a filesystems may + * have been frozen through the block layer via multiple block devices. + * The filesystem remains frozen until all block devices are unfrozen. */ int thaw_super(struct super_block *sb, enum freeze_holder who) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 7dc6c1bf5f55..b2a3f1c61c19 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1185,7 +1185,8 @@ enum { struct sb_writers { unsigned short frozen; /* Is sb frozen? */ - unsigned short freeze_holders; /* Who froze fs? */ + int freeze_kcount; /* How many kernel freeze requests? */ + int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; @@ -2051,9 +2052,24 @@ extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); +/** + * enum freeze_holder - holder of the freeze + * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem + * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem + * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed + * + * Indicate who the owner of the freeze or thaw request is and whether + * the freeze needs to be exclusive or can nest. + * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the + * same holder aren't allowed. It is however allowed to hold a single + * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at + * the same time. This is relied upon by some filesystems during online + * repair or similar. + */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), + FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { -- cgit v1.2.3 From b30850c58b5b9b7008eb6fc4a65bfb003a0714a7 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Nov 2023 12:51:30 +0100 Subject: super: massage wait event mechanism We're currently using two separate helpers wait_born() and wait_dead() when we can just all do it in a single helper super_load_flags(). We're also acquiring the lock before we check whether this superblock is even a viable candidate. If it's already dying we don't even need to bother with the lock. Link: https://lore.kernel.org/r/20231127-vfs-super-massage-wait-v1-1-9ab277bfd01a@kernel.org Signed-off-by: Christian Brauner --- fs/super.c | 51 ++++++++++++++------------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index aa4e5c11ee32..9cfe9c379c3f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -81,16 +81,13 @@ static inline void super_unlock_shared(struct super_block *sb) super_unlock(sb, false); } -static inline bool wait_born(struct super_block *sb) +static bool super_flags(const struct super_block *sb, unsigned int flags) { - unsigned int flags; - /* * Pairs with smp_store_release() in super_wake() and ensures - * that we see SB_BORN or SB_DYING after we're woken. + * that we see @flags after we're woken. */ - flags = smp_load_acquire(&sb->s_flags); - return flags & (SB_BORN | SB_DYING); + return smp_load_acquire(&sb->s_flags) & flags; } /** @@ -111,10 +108,15 @@ static inline bool wait_born(struct super_block *sb) */ static __must_check bool super_lock(struct super_block *sb, bool excl) { - lockdep_assert_not_held(&sb->s_umount); -relock: + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING)); + + /* Don't pointlessly acquire s_umount. */ + if (super_flags(sb, SB_DYING)) + return false; + __super_lock(sb, excl); /* @@ -127,20 +129,8 @@ relock: return false; } - /* Has called ->get_tree() successfully. */ - if (sb->s_flags & SB_BORN) - return true; - - super_unlock(sb, excl); - - /* wait until the superblock is ready or dying */ - wait_var_event(&sb->s_flags, wait_born(sb)); - - /* - * Neither SB_BORN nor SB_DYING are ever unset so we never loop. - * Just reacquire @sb->s_umount for the caller. - */ - goto relock; + WARN_ON_ONCE(!(sb->s_flags & SB_BORN)); + return true; } /* wait and try to acquire read-side of @sb->s_umount */ @@ -523,18 +513,6 @@ void deactivate_super(struct super_block *s) EXPORT_SYMBOL(deactivate_super); -static inline bool wait_dead(struct super_block *sb) -{ - unsigned int flags; - - /* - * Pairs with memory barrier in super_wake() and ensures - * that we see SB_DEAD after we're woken. - */ - flags = smp_load_acquire(&sb->s_flags); - return flags & SB_DEAD; -} - /** * grab_super - acquire an active reference to a superblock * @sb: superblock to acquire @@ -561,7 +539,7 @@ static bool grab_super(struct super_block *sb) } super_unlock_excl(sb); } - wait_var_event(&sb->s_flags, wait_dead(sb)); + wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD)); put_super(sb); return false; } @@ -908,8 +886,7 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - /* Pairs with memory marrier in super_wake(). */ - if (smp_load_acquire(&sb->s_flags) & SB_DYING) + if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); -- cgit v1.2.3 From 63513f8574c5ef481cd6a85b8c71f6f40f7d7957 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Nov 2023 12:51:31 +0100 Subject: super: don't bother with WARN_ON_ONCE() We hold our own active reference and we've checked it above. Link: https://lore.kernel.org/r/20231127-vfs-super-massage-wait-v1-2-9ab277bfd01a@kernel.org Signed-off-by: Christian Brauner --- fs/super.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/super.c') diff --git a/fs/super.c b/fs/super.c index 9cfe9c379c3f..1745cf7ea005 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2067,10 +2067,7 @@ retry: /* Release s_umount to preserve sb_start_write -> s_umount ordering */ super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - if (!super_lock_excl(sb)) { - WARN_ON_ONCE("Dying superblock while freezing!"); - return -EINVAL; - } + __super_lock_excl(sb); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; -- cgit v1.2.3