From 43bbddc067883d94de7a43d5756a295439fbe37d Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:57 +0800 Subject: ext4: add two helper functions extent_logical_end() and pa_logical_end() When we use lstart + len to calculate the end of free extent or prealloc space, it may exceed the maximum value of 4294967295(0xffffffff) supported by ext4_lblk_t and cause overflow, which may lead to various problems. Therefore, we add two helper functions, extent_logical_end() and pa_logical_end(), to limit the type of end to loff_t, and also convert lstart to loff_t for calculation to avoid overflow. Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-2-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 9 +++------ fs/ext4/mballoc.h | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 21b903fe546e..4cb13b3e41b3 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4432,7 +4432,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* first, let's learn actual file size * given current request is allocated */ - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); @@ -4766,7 +4766,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; - loff_t tmp_pa_end; struct rb_node *iter; ext4_fsblk_t goal_block; @@ -4862,9 +4861,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ - tmp_pa_end = (loff_t)tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - if (ac->ac_o_ex.fe_logical >= tmp_pa_end) { + if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } @@ -5769,7 +5766,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; - size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len); + size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index df6b5e7c2274..d7aeb5da7d86 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -233,6 +233,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } +static inline loff_t extent_logical_end(struct ext4_sb_info *sbi, + struct ext4_free_extent *fex) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len); +} + +static inline loff_t pa_logical_end(struct ext4_sb_info *sbi, + struct ext4_prealloc_space *pa) +{ + /* Use loff_t to avoid end exceeding ext4_lblk_t max. */ + return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len); +} + typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, -- cgit v1.2.3 From bc056e7163ac7db945366de219745cf94f32a3e6 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:58 +0800 Subject: ext4: fix BUG in ext4_mb_new_inode_pa() due to overflow When we calculate the end position of ext4_free_extent, this position may be exactly where ext4_lblk_t (i.e. uint) overflows. For example, if ac_g_ex.fe_logical is 4294965248 and ac_orig_goal_len is 2048, then the computed end is 0x100000000, which is 0. If ac->ac_o_ex.fe_logical is not the first case of adjusting the best extent, that is, new_bex_end > 0, the following BUG_ON will be triggered: ========================================================= kernel BUG at fs/ext4/mballoc.c:5116! invalid opcode: 0000 [#1] PREEMPT SMP PTI CPU: 3 PID: 673 Comm: xfs_io Tainted: G E 6.5.0-rc1+ #279 RIP: 0010:ext4_mb_new_inode_pa+0xc5/0x430 Call Trace: ext4_mb_use_best_found+0x203/0x2f0 ext4_mb_try_best_found+0x163/0x240 ext4_mb_regular_allocator+0x158/0x1550 ext4_mb_new_blocks+0x86a/0xe10 ext4_ext_map_blocks+0xb0c/0x13a0 ext4_map_blocks+0x2cd/0x8f0 ext4_iomap_begin+0x27b/0x400 iomap_iter+0x222/0x3d0 __iomap_dio_rw+0x243/0xcb0 iomap_dio_rw+0x16/0x80 ========================================================= A simple reproducer demonstrating the problem: mkfs.ext4 -F /dev/sda -b 4096 100M mount /dev/sda /tmp/test fallocate -l1M /tmp/test/tmp fallocate -l10M /tmp/test/file fallocate -i -o 1M -l16777203M /tmp/test/file fsstress -d /tmp/test -l 0 -n 100000 -p 8 & sleep 10 && killall -9 fsstress rm -f /tmp/test/tmp xfs_io -c "open -ad /tmp/test/file" -c "pwrite -S 0xff 0 8192" We simply refactor the logic for adjusting the best extent by adding a temporary ext4_free_extent ex and use extent_logical_end() to avoid overflow, which also simplifies the code. Cc: stable@kernel.org # 6.4 Fixes: 93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()") Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-3-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4cb13b3e41b3..86bce870dc5a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5177,8 +5177,11 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { - int new_bex_start; - int new_bex_end; + struct ext4_free_extent ex = { + .fe_logical = ac->ac_g_ex.fe_logical, + .fe_len = ac->ac_orig_goal_len, + }; + loff_t orig_goal_end = extent_logical_end(sbi, &ex); /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart @@ -5197,29 +5200,23 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) * still cover original start * 3. Else, keep the best ex at start of original request. */ - new_bex_end = ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len); - new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical >= new_bex_start) - goto adjust_bex; + ex.fe_len = ac->ac_b_ex.fe_len; - new_bex_start = ac->ac_g_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (ac->ac_o_ex.fe_logical < new_bex_end) + ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); + if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; - new_bex_start = ac->ac_o_ex.fe_logical; - new_bex_end = - new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + ex.fe_logical = ac->ac_g_ex.fe_logical; + if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex)) + goto adjust_bex; + ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: - ac->ac_b_ex.fe_logical = new_bex_start; + ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); - BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + - EXT4_C2B(sbi, ac->ac_orig_goal_len))); + BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; -- cgit v1.2.3 From bedc5d34632c21b5adb8ca7143d4c1f794507e4c Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 24 Jul 2023 20:10:59 +0800 Subject: ext4: avoid overlapping preallocations due to overflow Let's say we want to allocate 2 blocks starting from 4294966386, after predicting the file size, start is aligned to 4294965248, len is changed to 2048, then end = start + size = 0x100000000. Since end is of type ext4_lblk_t, i.e. uint, end is truncated to 0. This causes (pa->pa_lstart >= end) to always hold when checking if the current extent to be allocated crosses already preallocated blocks, so the resulting ac_g_ex may cross already preallocated blocks. Hence we convert the end type to loff_t and use pa_logical_end() to avoid overflow. Signed-off-by: Baokun Li Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230724121059.11834-4-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 86bce870dc5a..78a4a24e2f57 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4222,12 +4222,13 @@ ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_ static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t start, ext4_lblk_t end) + ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start; + loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); @@ -4236,7 +4237,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) @@ -4258,14 +4259,14 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; - ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; + ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; + loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; @@ -4284,7 +4285,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); @@ -4364,8 +4365,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, } if (left_pa) { - left_pa_end = - left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } @@ -4404,8 +4404,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; - ext4_lblk_t end; - loff_t size, start_off; + loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; -- cgit v1.2.3 From 98175720c9ed3bac857b0364321517cc2d695a3f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:47 +0200 Subject: ext4: remove pointless sb_rdonly() checks from freezing code ext4_freeze() and ext4_unfreeze() checks for sb_rdonly(). However this check is pointless as VFS already checks for read-only filesystem before calling filesystem specific methods. Remove the pointless checks. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..ffc4fb73f133 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6347,12 +6347,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) static int ext4_freeze(struct super_block *sb) { int error = 0; - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; + journal_t *journal = EXT4_SB(sb)->s_journal; if (journal) { /* Now we set up the journal barrier. */ @@ -6386,7 +6381,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (sb_rdonly(sb) || ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(EXT4_SB(sb))) return 0; if (EXT4_SB(sb)->s_journal) { -- cgit v1.2.3 From d5d020b3294b69eaf3b8985e7a37ba237849c390 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:48 +0200 Subject: ext4: use sb_rdonly() helper for checking read-only flag sb_rdonly() helper instead of directly checking sb->s_flags. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ffc4fb73f133..19514f98e2fe 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6084,7 +6084,7 @@ static void ext4_update_super(struct super_block *sb) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & SB_RDONLY)) + if (!sb_rdonly(sb)) ext4_update_tstamp(es, s_wtime); es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written + @@ -6675,7 +6675,7 @@ restore_opts: * If there was a failing r/w to ro transition, we may need to * re-enable quota */ - if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) && + if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); sb->s_flags = old_sb_flags; -- cgit v1.2.3 From eb8ab4443aec5ffe923a471b337568a8158cd32b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:49 +0200 Subject: ext4: make ext4_forced_shutdown() take struct super_block Currently ext4_forced_shutdown() takes struct ext4_sb_info but most callers need to get it from struct super_block anyway. So just pass in struct super_block to save all callers from some boilerplate code. No functional changes. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 ++-- fs/ext4/ext4_jbd2.c | 2 +- fs/ext4/file.c | 13 ++++++------- fs/ext4/fsync.c | 2 +- fs/ext4/ialloc.c | 2 +- fs/ext4/inline.c | 2 +- fs/ext4/inode.c | 24 ++++++++++++------------ fs/ext4/ioctl.c | 2 +- fs/ext4/namei.c | 8 ++++---- fs/ext4/page-io.c | 2 +- fs/ext4/super.c | 14 +++++++------- fs/ext4/xattr.c | 2 +- 12 files changed, 38 insertions(+), 39 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..feb38c9fe129 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2222,9 +2222,9 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); #define EXT4_FLAGS_SHUTDOWN 1 #define EXT4_FLAGS_BDEV_IS_DAX 2 -static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) +static inline int ext4_forced_shutdown(struct super_block *sb) { - return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); + return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } /* diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..b72a22a57d20 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -67,7 +67,7 @@ static int ext4_journal_check_start(struct super_block *sb) might_sleep(); - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (sb_rdonly(sb)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..2071b1e4322c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -131,7 +131,7 @@ static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) @@ -153,7 +153,7 @@ static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, { struct inode *inode = file_inode(in); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } @@ -709,7 +709,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; #ifdef CONFIG_FS_DAX @@ -807,10 +807,9 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - struct dax_device *dax_dev = sbi->s_daxdev; + struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; /* @@ -886,7 +885,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0c56f3a011a1..bffc1d0994f5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -133,7 +133,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; ASSERT(ext4_journal_current_handle() == NULL); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..060630c0b0ca 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -950,7 +950,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return ERR_PTR(-EIO); ngroups = ext4_get_groups_count(sb); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..3623dfcc8fc7 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -228,7 +228,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..c6fa59e57f1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -2213,7 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || + if (ext4_forced_shutdown(sb) || ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* @@ -2540,7 +2540,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; @@ -2759,7 +2759,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2798,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2857,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -5135,7 +5135,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) sb_rdonly(inode->i_sb)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5255,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5676,7 +5676,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5702,7 +5702,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..0d3aef1118cb 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -801,7 +801,7 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags) if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; - if (ext4_forced_shutdown(sbi)) + if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..6298cfaaa0bd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3142,7 +3142,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; /* Initialize quotas before so that eventual writes go in @@ -3301,7 +3301,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; trace_ext4_unlink_enter(dir, dentry); @@ -3369,7 +3369,7 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(dir->i_sb))) return -EIO; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, @@ -4189,7 +4189,7 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(old_dir->i_sb)))) + if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 3621f29ec671..dfdd7e5cf038 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -184,7 +184,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) io_end->handle = NULL; /* Following call will use up the handle */ ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { + if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { ext4_msg(inode->i_sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 19514f98e2fe..0038233eafa8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -758,7 +758,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -783,7 +783,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -818,7 +818,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -898,7 +898,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -992,7 +992,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return; trace_ext4_error(sb, function, line); @@ -6298,7 +6298,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sbi))) + if (unlikely(ext4_forced_shutdown(sb))) return 0; trace_ext4_sync_fs(sb, wait); @@ -6381,7 +6381,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(EXT4_SB(sb))) + if (ext4_forced_shutdown(sb)) return 0; if (EXT4_SB(sb)->s_journal) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 05151d61b00b..7cc502c06246 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -701,7 +701,7 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, { int error; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) -- cgit v1.2.3 From 22b8d707b07e6e06f50fe1d9ca8756e1f894eb0d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:50 +0200 Subject: ext4: make 'abort' mount option handling standard 'abort' mount option is the only mount option that has special handling and sets a bit in sbi->s_mount_flags. There is not strong reason for that so just simplify the code and make 'abort' set a bit in sbi->s_mount_opt2 as any other mount option. This simplifies the code and will allow us to drop EXT4_MF_FS_ABORTED completely in the following patch. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 16 ++-------------- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index feb38c9fe129..907829007f3f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1235,6 +1235,7 @@ struct ext4_inode_info { #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ +#define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0038233eafa8..f84142907cd5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1897,6 +1897,7 @@ static const struct mount_opts { {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, #endif + {Opt_abort, EXT4_MOUNT2_ABORT, MOPT_SET | MOPT_2}, {Opt_err, 0, 0} }; @@ -1965,8 +1966,6 @@ struct ext4_fs_context { unsigned int mask_s_mount_opt; unsigned int vals_s_mount_opt2; unsigned int mask_s_mount_opt2; - unsigned long vals_s_mount_flags; - unsigned long mask_s_mount_flags; unsigned int opt_flags; /* MOPT flags */ unsigned int spec; u32 s_max_batch_time; @@ -2117,12 +2116,6 @@ EXT4_SET_CTX(mount_opt2); EXT4_CLEAR_CTX(mount_opt2); EXT4_TEST_CTX(mount_opt2); -static inline void ctx_set_mount_flag(struct ext4_fs_context *ctx, int bit) -{ - set_bit(bit, &ctx->mask_s_mount_flags); - set_bit(bit, &ctx->vals_s_mount_flags); -} - static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ext4_fs_context *ctx = fc->fs_private; @@ -2186,9 +2179,6 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ext4_msg(NULL, KERN_WARNING, "Ignoring removed %s option", param->key); return 0; - case Opt_abort: - ctx_set_mount_flag(ctx, EXT4_MF_FS_ABORTED); - return 0; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT ctx_set_flags(ctx, SB_INLINECRYPT); @@ -2842,8 +2832,6 @@ static void ext4_apply_options(struct fs_context *fc, struct super_block *sb) sbi->s_mount_opt |= ctx->vals_s_mount_opt; sbi->s_mount_opt2 &= ~ctx->mask_s_mount_opt2; sbi->s_mount_opt2 |= ctx->vals_s_mount_opt2; - sbi->s_mount_flags &= ~ctx->mask_s_mount_flags; - sbi->s_mount_flags |= ctx->vals_s_mount_flags; sb->s_flags &= ~ctx->mask_s_flags; sb->s_flags |= ctx->vals_s_flags; @@ -6497,7 +6485,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) goto restore_opts; } - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (test_opt2(sb, ABORT)) ext4_abort(sb, ESHUTDOWN, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | -- cgit v1.2.3 From 95257987a6387f02970eda707e55a06cce734e18 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:51 +0200 Subject: ext4: drop EXT4_MF_FS_ABORTED flag EXT4_MF_FS_ABORTED flag has practically the same intent as EXT4_FLAGS_SHUTDOWN flag. The shutdown flag is checked in many more places than the aborted flag which is mostly the historical artifact where we were relying on SB_RDONLY checks instead of the aborted flag checks. There are only three places - ext4_sync_file(), __ext4_remount(), and mballoc debug code - which check aborted flag and not shutdown flag and this is arguably a bug. Avoid these inconsistencies by removing EXT4_MF_FS_ABORTED flag and using EXT4_FLAGS_SHUTDOWN everywhere. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/fsync.c | 7 +++---- fs/ext4/inode.c | 8 +++----- fs/ext4/mballoc.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 907829007f3f..89d76d50af4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index bffc1d0994f5..b40d3b29f7e5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,7 +131,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6fa59e57f1e..100c3ec6da6c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 78a4a24e2f57..1dc63e329e64 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5664,7 +5664,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5698,7 +5698,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f84142907cd5..20a8e64da4ac 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -657,7 +657,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -6502,7 +6502,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_error_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } -- cgit v1.2.3 From e0e985f3f8941438a66ab8abb94cb011b9fb39a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:52 +0200 Subject: ext4: avoid starting transaction on read-only fs in ext4_quota_off() When the filesystem gets first remounted read-only and then unmounted, ext4_quota_off() will try to start a transaction (and fail) on read-only filesystem to cleanup inode flags for legacy quota files. Just bail before trying to start a transaction instead since that is going to issue a warning. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 20a8e64da4ac..a9a7c38c7442 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -7072,6 +7072,13 @@ static int ext4_quota_off(struct super_block *sb, int type) err = dquot_quota_off(sb, type); if (err || ext4_has_feature_quota(sb)) goto out_put; + /* + * When the filesystem was remounted read-only first, we cannot cleanup + * inode flags here. Bad luck but people should be using QUOTA feature + * these days anyway. + */ + if (sb_rdonly(sb)) + goto out_put; inode_lock(inode); /* -- cgit v1.2.3 From e7fc2b31e04c46c9e2098bba710c9951c6b968af Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:53 +0200 Subject: ext4: warn on read-only filesystem in ext4_journal_check_start() Now that filesystem abort marks the filesystem as shutdown, we shouldn't be ever hitting the sb_rdonly() check in ext4_journal_check_start(). Since this is a suitable place for catching all sorts of programming errors, convert the check to WARN_ON instead of dropping it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b72a22a57d20..ca0eaf2147b0 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -70,8 +70,9 @@ static int ext4_journal_check_start(struct super_block *sb) if (unlikely(ext4_forced_shutdown(sb))) return -EIO; - if (sb_rdonly(sb)) + if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; + WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE); journal = EXT4_SB(sb)->s_journal; /* -- cgit v1.2.3 From ffb6844e28ef6b9d76bee378774d7afbc3db6da9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:54 +0200 Subject: ext4: drop read-only check in ext4_init_inode_table() We better should not be initializing inode tables on read-only filesystem. The following transaction start will warn us and make the function bail anyway so drop the pointless check. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 060630c0b0ca..e0698f54e17a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1523,12 +1523,6 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int num, ret = 0, used_blks = 0; unsigned long used_inos = 0; - /* This should not happen, but just to be sure check this */ - if (sb_rdonly(sb)) { - ret = 1; - goto out; - } - gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp || !grp) goto out; -- cgit v1.2.3 From f1128084b40e520bea8bb32b3ff4d03745ab7e64 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:55 +0200 Subject: ext4: drop read-only check in ext4_write_inode() We should not have dirty inodes on read-only filesystem. Also silently bailing without writing anything would be a problem when we enable quotas during remount while the filesystem is read-only. So drop the read-only check. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 100c3ec6da6c..1b9003840bc1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5129,8 +5129,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) -- cgit v1.2.3 From 889860e452d7436ca72018b8a03cbd89c38d6384 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:56 +0200 Subject: ext4: drop read-only check from ext4_force_commit() JBD2 code will quickly return without doing anything when there's nothing to commit so there's no point in the read-only check in ext4_force_commit(). Just drop it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-10-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9a7c38c7442..4613264344b0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6270,13 +6270,7 @@ static int ext4_clear_journal_err(struct super_block *sb, */ int ext4_force_commit(struct super_block *sb) { - journal_t *journal; - - if (sb_rdonly(sb)) - return 0; - - journal = EXT4_SB(sb)->s_journal; - return ext4_journal_force_commit(journal); + return ext4_journal_force_commit(EXT4_SB(sb)->s_journal); } static int ext4_sync_fs(struct super_block *sb, int wait) -- cgit v1.2.3 From 1e1566b9c85fbd6150657ea17f50fd42b9166d31 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:57 +0200 Subject: ext4: replace read-only check for shutdown check in mmp code The multi-mount protection kthread checks for read-only filesystem and aborts in that case. The remount code actually handles stopping of the kthread on remount so the only purpose of the check is in case of emergency remount read-only. Replace the check for read-only filesystem with a check for shutdown filesystem as running MMP on such is risky anyway and it makes ordering of things during remount simpler. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-11-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/mmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 0aaf38ffcb6e..bd946d0c71b7 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() && !sb_rdonly(sb)) { + while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); -- cgit v1.2.3 From 304749c0d5e216479ea4d553ad04ba1390d5c707 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Fri, 30 Jun 2023 14:29:27 +0530 Subject: ext4: replace CR_FAST macro with inline function for readability Replace CR_FAST with ext4_mb_cr_expensive() inline function for better readability. This function returns true if the criteria is one of the expensive/slower ones where lots of disk IO/prefetching is acceptable. No functional changes are intended in this patch. Signed-off-by: Ojaswin Mujoo Reviewed-by: Jan Kara Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230630085927.140137-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 7 ++++--- fs/ext4/mballoc.c | 13 +++++++++---- 2 files changed, 13 insertions(+), 7 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89d76d50af4c..532b70f613e9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -176,9 +176,6 @@ enum criteria { EXT4_MB_NUM_CRS }; -/* criteria below which we use fast block scanning and avoid unnecessary IO */ -#define CR_FAST CR_GOAL_LEN_SLOW - /* * Flags used in mballoc's allocation_context flags field. * @@ -2924,6 +2921,10 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, int state); +static inline bool ext4_mb_cr_expensive(enum criteria cr) +{ + return cr >= CR_GOAL_LEN_SLOW; +} /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1dc63e329e64..a2d791953da5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2450,7 +2450,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, break; } - if (ac->ac_criteria < CR_FAST) { + if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough @@ -2634,7 +2634,12 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) goto out; - if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len) + /* + * In all criterias except CR_ANY_FREE we try to avoid groups that + * can't possibly satisfy the full goal request due to insufficient + * free blocks. + */ + if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; @@ -2658,7 +2663,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ - if (cr < CR_FAST && + if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && @@ -2852,7 +2857,7 @@ repeat: * spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && - (cr >= CR_FAST || + (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { -- cgit v1.2.3 From a9ce5993a0f5c0887c8a1b4ffa3b8046fbcfdc93 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:55 +0800 Subject: ext4: correct grp validation in ext4_mb_good_group Group corruption check will access memory of grp and will trigger kernel crash if grp is NULL. So do NULL check before corruption check. Fixes: 5354b2af3406 ("ext4: allow ext4_get_group_info() to fail") Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a2d791953da5..e07d2a4fbcd1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2553,7 +2553,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); - if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp)) + if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; -- cgit v1.2.3 From 60c672b7f2d1e5dd1774f2399b355c9314e709f8 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:56 +0800 Subject: ext4: avoid potential data overflow in next_linear_group ngroups is ext4_group_t (unsigned int) while next_linear_group treat it in int. If ngroups is bigger than max number described by int, it will be treat as a negative number. Then "return group + 1 >= ngroups ? 0 : group + 1;" may keep returning 0. Switch int to ext4_group_t in next_linear_group to fix the overflow. Fixes: 196e402adf2e ("ext4: improve cr 0 / cr 1 group scanning") Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e07d2a4fbcd1..bf041932c599 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1080,8 +1080,9 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac) * Return next linear group for allocation. If linear traversal should not be * performed, this function just returns the same group */ -static int -next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +static ext4_group_t +next_linear_group(struct ext4_allocation_context *ac, ext4_group_t group, + ext4_group_t ngroups) { if (!should_optimize_scan(ac)) goto inc_and_return; -- cgit v1.2.3 From 919eb90cec4049cecf4a9f996afb0f14e3864fca Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:57 +0800 Subject: ext4: return found group directly in ext4_mb_choose_next_group_p2_aligned Return good group when it's found in loop to remove unnecessary NULL initialization of grp and futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index bf041932c599..880b9731edaa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -874,7 +874,7 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_group_info *iter, *grp; + struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) @@ -883,7 +883,6 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); - grp = NULL; for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; @@ -892,28 +891,22 @@ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } - grp = NULL; list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { - grp = iter; - break; + *group = iter->bb_group; + ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); - if (grp) - break; } - if (!grp) { - /* Increment cr and search again */ - *new_cr = CR_GOAL_LEN_FAST; - } else { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; - } + /* Increment cr and search again if no group is found */ + *new_cr = CR_GOAL_LEN_FAST; } /* -- cgit v1.2.3 From bb60caa2db6697c20a0842b5b3c192aa1800da1a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:58 +0800 Subject: ext4: use is_power_of_2 helper in ext4_mb_regular_allocator Use intuitive is_power_of_2 helper in ext4_mb_regular_allocator. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 880b9731edaa..3dd2609ea133 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2799,10 +2799,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { - /* - * This should tell if fe_len is exactly power of 2 - */ - if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) + if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } -- cgit v1.2.3 From ad635507b5b22d59457b6db6d8a0e4ddf7ad2b4c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:31:59 +0800 Subject: ext4: remove unnecessary return for void function The return at end of void function is unnecessary, just remove it. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3dd2609ea133..154ae1fca10c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4976,7 +4976,6 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); n = rb_next(n); } - return; } /* @@ -5727,12 +5726,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) #else static inline void ext4_mb_show_pa(struct super_block *sb) { - return; } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); - return; } #endif @@ -5973,12 +5970,9 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ - if (lg_prealloc_count > 8) { + if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); - return; - } - return ; } /* @@ -6632,7 +6626,6 @@ do_more: error_return: brelse(bitmap_bh); ext4_std_error(sb, err); - return; } /** @@ -6735,7 +6728,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, } ext4_mb_clear_bb(handle, inode, block, count, flags); - return; } /** -- cgit v1.2.3 From de8bf0e5ee7482585450357c6d4eddec8efc5cb7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:00 +0800 Subject: ext4: replace the traditional ternary conditional operator with with max()/min() Replace the traditional ternary conditional operator with with max()/min() Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 154ae1fca10c..f0deb5f2f81d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6917,8 +6917,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) void *bitmap; bitmap = e4b->bd_bitmap; - start = (e4b->bd_info->bb_first_free > start) ? - e4b->bd_info->bb_first_free : start; + start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -7135,8 +7134,7 @@ ext4_mballoc_query_range( ext4_lock_group(sb, group); - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + start = max(e4b.bd_info->bb_first_free, start); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; -- cgit v1.2.3 From f6c72fef1272e65eff8d5ecef8c744686f6b7745 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:01 +0800 Subject: ext4: remove unused ext4_{set}/{clear}_bit_atomic Remove ext4_set_bit_atomic and ext4_clear_bit_atomic which are defined but not used. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 532b70f613e9..fb4d914ea888 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1250,10 +1250,8 @@ struct ext4_inode_info { #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le -#define ext4_set_bit_atomic ext2_set_bit_atomic #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le -#define ext4_clear_bit_atomic ext2_clear_bit_atomic #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le -- cgit v1.2.3 From b50675a4a6a69110c0c2baadebd2075d3b31b25c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:02 +0800 Subject: ext4: return found group directly in ext4_mb_choose_next_group_goal_fast Return good group when it's found in loop to remove futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f0deb5f2f81d..3b1f90dfb119 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -959,16 +959,14 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; - } else { - *new_cr = CR_BEST_AVAIL_LEN; - } + *new_cr = CR_BEST_AVAIL_LEN; } /* -- cgit v1.2.3 From bcb123ac9b9887478da4185b55dfbf1a72550848 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:03 +0800 Subject: ext4: return found group directly in ext4_mb_choose_next_group_best_avail Return good group when it's found in loop to remove futher check if good group is found after loop. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-10-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3b1f90dfb119..f9189f7566fb 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1042,18 +1042,16 @@ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); - if (grp) - break; + if (grp) { + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; + return; + } } - if (grp) { - *group = grp->bb_group; - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; - } else { - /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; - *new_cr = CR_GOAL_LEN_SLOW; - } + /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ + ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; + *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) -- cgit v1.2.3 From 4eea9fbed950f240bf6e627e1c784b8d54c54988 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 1 Aug 2023 22:32:04 +0800 Subject: ext4: correct some stale comment of criteria We named criteria with CR_XXX, correct stale comment to criteria with raw number. Signed-off-by: Kemeng Shi Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230801143204.2284343-11-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f9189f7566fb..b89b5f0816e7 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2782,8 +2782,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) /* * ac->ac_2order is set only if the fe_len is a power of 2 - * if ac->ac_2order is set we also set criteria to 0 so that we - * try exact allocation using buddy. + * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED + * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; @@ -2840,8 +2840,8 @@ repeat: /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit - * prefetching at cr=0/1, otherwise mballoc can - * spend a lot of time loading imperfect groups + * prefetching at inexpensive CR, otherwise mballoc + * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && (ext4_mb_cr_expensive(cr) || -- cgit v1.2.3 From 7ca4b085f430f3774c3838b3da569ceccd6a0177 Mon Sep 17 00:00:00 2001 From: Luís Henriques Date: Thu, 3 Aug 2023 10:17:13 +0100 Subject: ext4: fix memory leaks in ext4_fname_{setup_filename,prepare_lookup} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the filename casefolding fails, we'll be leaking memory from the fscrypt_name struct, namely from the 'crypto_buf.name' member. Make sure we free it in the error path on both ext4_fname_setup_filename() and ext4_fname_prepare_lookup() functions. Cc: stable@kernel.org Fixes: 1ae98e295fa2 ("ext4: optimize match for casefolded encrypted dirs") Signed-off-by: Luís Henriques Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20230803091713.13239-1-lhenriques@suse.de Signed-off-by: Theodore Ts'o --- fs/ext4/crypto.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e20ac0654b3f..453d4da5de52 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -33,6 +33,8 @@ int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, iname, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } @@ -51,6 +53,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); + if (err) + ext4_fname_free_filename(fname); #endif return err; } -- cgit v1.2.3 From 772c9f691dcf3a487f29ddb90a5a15c78d7328e1 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 16 Jul 2023 19:33:34 +0530 Subject: ext4: don't use CR_BEST_AVAIL_LEN for non-regular files Using CR_BEST_AVAIL_LEN only make sense for regular files, as for non-regular files we never normalize the allocation request length i.e. goal len is same as original length (ac_g_ex.fe_len == ac_o_ex.fe_len). Hence there is no scope of trimming the goal length to make it satisfy original request len. Thus this patch avoids using CR_BEST_AVAIL_LEN criteria for non-regular files request. Cc: stable@kernel.org Fixes: 33122aa930f1 ("ext4: Add allocation criteria 1.5 (CR1_5)") Reported-by: Eric Whitney Signed-off-by: Ritesh Harjani (IBM) Tested-by: Eric Whitney Link: https://lore.kernel.org/r/2a694c748ff8b8c4b416995a24f06f07b55047a8.1689516047.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b89b5f0816e7..3d5b0b71d7f5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -966,7 +966,18 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context * } } - *new_cr = CR_BEST_AVAIL_LEN; + /* + * CR_BEST_AVAIL_LEN works based on the concept that we have + * a larger normalized goal len request which can be trimmed to + * a smaller goal len such that it can still satisfy original + * request len. However, allocation request for non-regular + * files never gets normalized. + * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). + */ + if (ac->ac_flags & EXT4_MB_HINT_DATA) + *new_cr = CR_BEST_AVAIL_LEN; + else + *new_cr = CR_GOAL_LEN_SLOW; } /* -- cgit v1.2.3 From 8e6cf5fbb7b47d337998faab3fcacdceaa547ead Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:08 +0800 Subject: jbd2: jbd2_journal_init_{dev,inode} return proper error return value Current jbd2_journal_init_{dev,inode} return NULL if some error happens, make them to pass out proper error return value. [ Fix from Yang Yingliang folded in. ] Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-11-yi.zhang@huaweicloud.com Link: https://lore.kernel.org/r/20230822030018.644419-1-yangyingliang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- fs/jbd2/journal.c | 19 +++++++++---------- fs/ocfs2/journal.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4613264344b0..279e37c3b275 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5815,7 +5815,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; journal = jbd2_journal_init_inode(journal_inode); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); return NULL; @@ -5894,7 +5894,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, journal = jbd2_journal_init_dev(bdev, sb->s_bdev, start, len, blocksize); - if (!journal) { + if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 6482fcca3dc6..15e33c26c6cd 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1531,7 +1531,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) - return NULL; + return ERR_PTR(-ENOMEM); journal->j_blocksize = blocksize; journal->j_dev = bdev; @@ -1576,6 +1576,7 @@ static journal_t *journal_init_common(struct block_device *bdev, * journal descriptor can store up to n blocks, we need enough * buffers to write out full descriptor block. */ + err = -ENOMEM; n = journal->j_blocksize / jbd2_min_tag_size(); journal->j_wbufsize = n; journal->j_fc_wbuf = NULL; @@ -1607,7 +1608,7 @@ err_cleanup: jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); kfree(journal); - return NULL; + return ERR_PTR(err); } /* jbd2_journal_init_dev and jbd2_journal_init_inode: @@ -1640,8 +1641,8 @@ journal_t *jbd2_journal_init_dev(struct block_device *bdev, journal_t *journal; journal = journal_init_common(bdev, fs_dev, start, len, blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); snprintf(journal->j_devname, sizeof(journal->j_devname), "%pg", journal->j_dev); @@ -1667,11 +1668,9 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) blocknr = 0; err = bmap(inode, &blocknr); - if (err || !blocknr) { - pr_err("%s: Cannot locate journal superblock\n", - __func__); - return NULL; + pr_err("%s: Cannot locate journal superblock\n", __func__); + return err ? ERR_PTR(err) : ERR_PTR(-EINVAL); } jbd2_debug(1, "JBD2: inode %s/%ld, size %lld, bits %d, blksize %ld\n", @@ -1681,8 +1680,8 @@ journal_t *jbd2_journal_init_inode(struct inode *inode) journal = journal_init_common(inode->i_sb->s_bdev, inode->i_sb->s_bdev, blocknr, inode->i_size >> inode->i_sb->s_blocksize_bits, inode->i_sb->s_blocksize); - if (!journal) - return NULL; + if (IS_ERR(journal)) + return ERR_CAST(journal); journal->j_inode = inode; snprintf(journal->j_devname, sizeof(journal->j_devname), diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 25d8072ccfce..1d2960e8ce74 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -911,9 +911,9 @@ int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) /* call the kernels journal init function now */ j_journal = jbd2_journal_init_inode(inode); - if (j_journal == NULL) { + if (IS_ERR(j_journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EINVAL; + status = PTR_ERR(j_journal); goto done; } @@ -1687,9 +1687,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } journal = jbd2_journal_init_inode(inode); - if (journal == NULL) { + if (IS_ERR(journal)) { mlog(ML_ERROR, "Linux journal layer error\n"); - status = -EIO; + status = PTR_ERR(journal); goto done; } -- cgit v1.2.3 From bc74e6a38d16d745a9bc28a7e343494019066492 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:09 +0800 Subject: ext4: cleanup ext4_get_dev_journal() and ext4_get_journal() Factor out a new helper form ext4_get_dev_journal() to get external journal bdev and check validation of this device, drop ext4_blkdev_get() helper, and also remove duplicate check of journal feature. It makes ext4_get_dev_journal() more clear than before. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-12-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 109 +++++++++++++++++++++++++------------------------------- 1 file changed, 49 insertions(+), 60 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 279e37c3b275..0eee238b290e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1105,26 +1105,6 @@ static const struct blk_holder_ops ext4_holder_ops = { .mark_dead = ext4_bdev_mark_dead, }; -/* - * Open the external journal device - */ -static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); - if (IS_ERR(bdev)) - goto fail; - return bdev; - -fail: - ext4_msg(sb, KERN_ERR, - "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return NULL; -} - /* * Release the journal device */ @@ -5768,14 +5748,14 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, ext4_msg(sb, KERN_ERR, "journal inode is deleted"); return NULL; } - - ext4_debug("Journal inode found at %p: %lld bytes\n", - journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); return NULL; } + + ext4_debug("Journal inode found at %p: %lld bytes\n", + journal_inode, journal_inode->i_size); return journal_inode; } @@ -5807,9 +5787,6 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - journal_inode = ext4_get_journal_inode(sb, journal_inum); if (!journal_inode) return NULL; @@ -5826,25 +5803,25 @@ static journal_t *ext4_get_journal(struct super_block *sb, return journal; } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, + dev_t j_dev, ext4_fsblk_t *j_start, + ext4_fsblk_t *j_len) { struct buffer_head *bh; - journal_t *journal; - ext4_fsblk_t start; - ext4_fsblk_t len; + struct block_device *bdev; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; - struct block_device *bdev; - if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) - return NULL; - - bdev = ext4_blkdev_get(j_dev, sb); - if (bdev == NULL) + bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, + &ext4_holder_ops); + if (IS_ERR(bdev)) { + ext4_msg(sb, KERN_ERR, + "failed to open journal device unknown-block(%u,%u) %ld", + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); return NULL; + } blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); @@ -5857,7 +5834,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, sb_block = EXT4_MIN_BLOCK_SIZE / blocksize; offset = EXT4_MIN_BLOCK_SIZE % blocksize; set_blocksize(bdev, blocksize); - if (!(bh = __bread(bdev, sb_block, blocksize))) { + bh = __bread(bdev, sb_block, blocksize); + if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); goto out_bdev; @@ -5867,56 +5845,67 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) || !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "bad superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + goto out_bh; } if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { - ext4_msg(sb, KERN_ERR, "external journal has " - "corrupt superblock"); - brelse(bh); - goto out_bdev; + ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); - brelse(bh); - goto out_bdev; + goto out_bh; } - len = ext4_blocks_count(es); - start = sb_block + 1; - brelse(bh); /* we're done with the superblock */ + *j_start = sb_block + 1; + *j_len = ext4_blocks_count(es); + brelse(bh); + return bdev; + +out_bh: + brelse(bh); +out_bdev: + blkdev_put(bdev, sb); + return NULL; +} + +static journal_t *ext4_get_dev_journal(struct super_block *sb, + dev_t j_dev) +{ + journal_t *journal; + ext4_fsblk_t j_start; + ext4_fsblk_t j_len; + struct block_device *journal_bdev; + + journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (!journal_bdev) + return NULL; - journal = jbd2_journal_init_dev(bdev, sb->s_bdev, - start, len, blocksize); + journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); goto out_bdev; } - journal->j_private = sb; - if (ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, true)) { - ext4_msg(sb, KERN_ERR, "I/O error on journal device"); - goto out_journal; - } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); goto out_journal; } - EXT4_SB(sb)->s_journal_bdev = bdev; + journal->j_private = sb; + EXT4_SB(sb)->s_journal_bdev = journal_bdev; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(bdev, sb); + blkdev_put(journal_bdev, sb); return NULL; } -- cgit v1.2.3 From ee5c807137ce283acebd83297f8855428cdd839a Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Fri, 11 Aug 2023 14:36:10 +0800 Subject: ext4: ext4_get_{dev}_journal return proper error value ext4_get_journal() and ext4_get_dev_journal() return NULL if they failed to init journal, making them return proper error value instead, also rename them to ext4_open_{inode,dev}_journal(). [ Folded fix to ext4_calculate_overhead() to check for an ERR_PTR instead of NULL. ] Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230811063610.2980059-13-yi.zhang@huaweicloud.com Reported-by: syzbot+b3123e6d9842e526de39@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20230826011029.2023140-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0eee238b290e..6edf7deeb2db 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4200,7 +4200,7 @@ int ext4_calculate_overhead(struct super_block *sb) else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ j_inode = ext4_get_journal_inode(sb, j_inum); - if (j_inode) { + if (!IS_ERR(j_inode)) { j_blocks = j_inode->i_size >> sb->s_blocksize_bits; overhead += EXT4_NUM_B2C(sbi, j_blocks); iput(j_inode); @@ -5740,18 +5740,18 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, journal_inode = ext4_iget(sb, journal_inum, EXT4_IGET_SPECIAL); if (IS_ERR(journal_inode)) { ext4_msg(sb, KERN_ERR, "no journal found"); - return NULL; + return ERR_CAST(journal_inode); } if (!journal_inode->i_nlink) { make_bad_inode(journal_inode); iput(journal_inode); ext4_msg(sb, KERN_ERR, "journal inode is deleted"); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) { ext4_msg(sb, KERN_ERR, "invalid journal inode"); iput(journal_inode); - return NULL; + return ERR_PTR(-EFSCORRUPTED); } ext4_debug("Journal inode found at %p: %lld bytes\n", @@ -5781,21 +5781,21 @@ static int ext4_journal_bmap(journal_t *journal, sector_t *block) return 0; } -static journal_t *ext4_get_journal(struct super_block *sb, - unsigned int journal_inum) +static journal_t *ext4_open_inode_journal(struct super_block *sb, + unsigned int journal_inum) { struct inode *journal_inode; journal_t *journal; journal_inode = ext4_get_journal_inode(sb, journal_inum); - if (!journal_inode) - return NULL; + if (IS_ERR(journal_inode)) + return ERR_CAST(journal_inode); journal = jbd2_journal_init_inode(journal_inode); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "Could not load journal inode"); iput(journal_inode); - return NULL; + return ERR_CAST(journal); } journal->j_private = sb; journal->j_bmap = ext4_journal_bmap; @@ -5813,6 +5813,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, ext4_fsblk_t sb_block; unsigned long offset; struct ext4_super_block *es; + int errno; bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, &ext4_holder_ops); @@ -5820,7 +5821,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); - return NULL; + return ERR_CAST(bdev); } blocksize = sb->s_blocksize; @@ -5828,6 +5829,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, if (blocksize < hblock) { ext4_msg(sb, KERN_ERR, "blocksize too small for journal device"); + errno = -EINVAL; goto out_bdev; } @@ -5838,6 +5840,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, if (!bh) { ext4_msg(sb, KERN_ERR, "couldn't read superblock of " "external journal"); + errno = -EINVAL; goto out_bdev; } @@ -5846,6 +5849,7 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, !(le32_to_cpu(es->s_feature_incompat) & EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) { ext4_msg(sb, KERN_ERR, "external journal has bad superblock"); + errno = -EFSCORRUPTED; goto out_bh; } @@ -5853,11 +5857,13 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && es->s_checksum != ext4_superblock_csum(sb, es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); + errno = -EFSCORRUPTED; goto out_bh; } if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); + errno = -EFSCORRUPTED; goto out_bh; } @@ -5870,31 +5876,34 @@ out_bh: brelse(bh); out_bdev: blkdev_put(bdev, sb); - return NULL; + return ERR_PTR(errno); } -static journal_t *ext4_get_dev_journal(struct super_block *sb, - dev_t j_dev) +static journal_t *ext4_open_dev_journal(struct super_block *sb, + dev_t j_dev) { journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; struct block_device *journal_bdev; + int errno = 0; journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (!journal_bdev) - return NULL; + if (IS_ERR(journal_bdev)) + return ERR_CAST(journal_bdev); journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); + errno = PTR_ERR(journal); goto out_bdev; } if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) { ext4_msg(sb, KERN_ERR, "External journal has more than one " "user (unsupported) - %d", be32_to_cpu(journal->j_superblock->s_nr_users)); + errno = -EINVAL; goto out_journal; } journal->j_private = sb; @@ -5906,7 +5915,7 @@ out_journal: jbd2_journal_destroy(journal); out_bdev: blkdev_put(journal_bdev, sb); - return NULL; + return ERR_PTR(errno); } static int ext4_load_journal(struct super_block *sb, @@ -5938,13 +5947,13 @@ static int ext4_load_journal(struct super_block *sb, } if (journal_inum) { - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; + journal = ext4_open_inode_journal(sb, journal_inum); + if (IS_ERR(journal)) + return PTR_ERR(journal); } else { - journal = ext4_get_dev_journal(sb, journal_dev); - if (!journal) - return -EINVAL; + journal = ext4_open_dev_journal(sb, journal_dev); + if (IS_ERR(journal)) + return PTR_ERR(journal); } journal_dev_ro = bdev_read_only(journal->j_dev); -- cgit v1.2.3 From 1524773425ae8113b0b782886366e68656b34e53 Mon Sep 17 00:00:00 2001 From: Zhihao Cheng Date: Wed, 28 Jun 2023 21:20:11 +0800 Subject: ext4: fix unttached inode after power cut with orphan file feature enabled Running generic/475(filesystem consistent tests after power cut) could easily trigger unattached inode error while doing fsck: Unattached zero-length inode 39405. Clear? no Unattached inode 39405 Connect to /lost+found? no Above inconsistence is caused by following process: P1 P2 ext4_create inode = ext4_new_inode_start_handle // itable records nlink=1 ext4_add_nondir err = ext4_add_entry // ENOSPC ext4_append ext4_bread ext4_getblk ext4_map_blocks // returns ENOSPC drop_nlink(inode) // won't be updated into disk inode ext4_orphan_add(handle, inode) ext4_orphan_file_add ext4_journal_stop(handle) jbd2_journal_commit_transaction // commit success >> power cut << ext4_fill_super ext4_load_and_init_journal // itable records nlink=1 ext4_orphan_cleanup ext4_process_orphan if (inode->i_nlink) // true, inode won't be deleted Then, allocated inode will be reserved on disk and corresponds to no dentries, so e2fsck reports 'unattached inode' problem. The problem won't happen if orphan file feature is disabled, because ext4_orphan_add() will update disk inode in orphan list mode. There are several places not updating disk inode while putting inode into orphan area, such as ext4_add_nondir(), ext4_symlink() and whiteout in ext4_rename(). Fix it by updating inode into disk in all error branches of these places. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217605 Fixes: 02f310fcf47f ("ext4: Speedup ext4 orphan inode handling") Signed-off-by: Zhihao Cheng Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230628132011.650383-1-chengzhihao1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/ext4') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6298cfaaa0bd..9a13431656cd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2799,6 +2799,7 @@ static int ext4_add_nondir(handle_t *handle, return err; } drop_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); return err; @@ -3436,6 +3437,7 @@ retry: err_drop_inode: clear_nlink(inode); + ext4_mark_inode_dirty(handle, inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); if (handle) @@ -4021,6 +4023,7 @@ end_rename: ext4_resetent(handle, &old, old.inode->i_ino, old_file_type); drop_nlink(whiteout); + ext4_mark_inode_dirty(handle, whiteout); ext4_orphan_add(handle, whiteout); } unlock_new_inode(whiteout); -- cgit v1.2.3 From 89cadf6e22a958014d09c901caf0cd2105780dbe Mon Sep 17 00:00:00 2001 From: Lu Hongfei Date: Fri, 7 Jul 2023 18:55:16 +0800 Subject: ext4: change the type of blocksize in ext4_mb_init_cache() The return value type of i_blocksize() is 'unsigned int', so the type of blocksize has been modified from 'int' to 'unsigned int' to ensure data type consistency. Signed-off-by: Lu Hongfei Link: https://lore.kernel.org/r/20230707105516.9156-1-luhongfei@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3d5b0b71d7f5..96068d687d9d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1256,7 +1256,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) { ext4_group_t ngroups; - int blocksize; + unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; -- cgit v1.2.3 From 79ebf48c44b5ba05a98af23f8830883daf36f4d3 Mon Sep 17 00:00:00 2001 From: Lu Hongfei Date: Fri, 7 Jul 2023 19:59:07 +0800 Subject: ext4: use sbi instead of EXT4_SB(sb) in ext4_mb_new_blocks_simple() Signed-off-by: Lu Hongfei Link: https://lore.kernel.org/r/20230707115907.26637-1-luhongfei@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 96068d687d9d..a807e8bf8664 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6092,7 +6092,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; - struct ext4_super_block *es = EXT4_SB(sb)->s_es; + struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || -- cgit v1.2.3 From a50bda147421e24c1a5d47ddcc0675360b7cb3ac Mon Sep 17 00:00:00 2001 From: Su Hui Date: Tue, 25 Jul 2023 12:33:11 +0800 Subject: ext4: mballoc: avoid garbage value from err clang's static analysis warning: fs/ext4/mballoc.c line 4178, column 6, Branch condition evaluates to a garbage value. err is uninitialized and will be judged when 'len <= 0' or it first enters the loop while the condition "!ext4_sb_block_valid()" is true. Although this can't make problems now, it's better to correct it. Signed-off-by: Su Hui Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/r/20230725043310.1227621-1-suhui@nfschina.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a807e8bf8664..1e4c667812a9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4087,7 +4087,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err; + int i, err = 0; int already; unsigned int clen, clen_changed, thisgrp_len; -- cgit v1.2.3 From b6c7d6dc8aebc04cefd342d6cccd24932be37d12 Mon Sep 17 00:00:00 2001 From: Cai Xinchen Date: Wed, 2 Aug 2023 03:00:25 +0000 Subject: ext4: remove unused function declaration These functions do not have its function implementation. So those function declaration is useless. Remove these Signed-off-by: Cai Xinchen Link: https://lore.kernel.org/r/20230802030025.173148-1-caixinchen1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb4d914ea888..ae458cde55d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2697,7 +2697,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); -extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); @@ -2853,7 +2852,6 @@ extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); -extern void ext4_check_inodes_bitmap(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); @@ -2896,7 +2894,6 @@ extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); -extern int ext4_mb_reserve_blocks(struct super_block *, int); extern void ext4_discard_preallocations(struct inode *, unsigned int); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); @@ -2976,7 +2973,6 @@ extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); -extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); @@ -3524,8 +3520,6 @@ extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); -extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, - unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); int ext4_readpage_inline(struct inode *inode, struct folio *folio); -- cgit v1.2.3 From 68228da51c9a436872a4ef4b5a7692e29f7e5bc7 Mon Sep 17 00:00:00 2001 From: Wang Jianjian Date: Thu, 3 Aug 2023 00:28:39 +0800 Subject: ext4: add correct group descriptors and reserved GDT blocks to system zone When setup_system_zone, flex_bg is not initialized so it is always 1. Use a new helper function, ext4_num_base_meta_blocks() which does not depend on sbi->s_log_groups_per_flex being initialized. [ Squashed two patches in the Link URL's below together into a single commit, which is simpler to review/understand. Also fix checkpatch warnings. --TYT ] Cc: stable@kernel.org Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/tencent_21AF0D446A9916ED5C51492CC6C9A0A77B05@qq.com Link: https://lore.kernel.org/r/tencent_D744D1450CC169AEA77FCF0A64719909ED05@qq.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 15 +++++++++++---- fs/ext4/block_validity.c | 8 ++++---- fs/ext4/ext4.h | 2 ++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1f72f977c6db..79b20d6ae39e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -913,11 +913,11 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) } /* - * This function returns the number of file system metadata clusters at + * This function returns the number of file system metadata blocks at * the beginning of a block group, including the reserved gdt blocks. */ -static unsigned ext4_num_base_meta_clusters(struct super_block *sb, - ext4_group_t block_group) +unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; @@ -935,8 +935,15 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb_meta(sb, block_group); } - return EXT4_NUM_B2C(sbi, num); + return num; } + +static unsigned int ext4_num_base_meta_clusters(struct super_block *sb, + ext4_group_t block_group) +{ + return EXT4_NUM_B2C(EXT4_SB(sb), ext4_num_base_meta_blocks(sb, block_group)); +} + /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5504f72bbbbe..6fe3c941b565 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -215,7 +215,6 @@ int ext4_setup_system_zone(struct super_block *sb) struct ext4_system_blocks *system_blks; struct ext4_group_desc *gdp; ext4_group_t i; - int flex_size = ext4_flex_bg_size(sbi); int ret; system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); @@ -223,12 +222,13 @@ int ext4_setup_system_zone(struct super_block *sb) return -ENOMEM; for (i=0; i < ngroups; i++) { + unsigned int meta_blks = ext4_num_base_meta_blocks(sb, i); + cond_resched(); - if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) { + if (meta_blks != 0) { ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1, 0); + meta_blks, 0); if (ret) goto err; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ae458cde55d1..2c2c3191bf41 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3079,6 +3079,8 @@ extern const char *ext4_decode_error(struct super_block *sb, int errno, extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); +extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, + ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, -- cgit v1.2.3 From 194505b55dd7899da114a4d47825204eefc0fff5 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 10 Aug 2023 12:55:59 -0400 Subject: ext4: drop dio overwrite only flag and associated warning The commit referenced below opened up concurrent unaligned dio under shared locking for pure overwrites. In doing so, it enabled use of the IOMAP_DIO_OVERWRITE_ONLY flag and added a warning on unexpected -EAGAIN returns as an extra precaution, since ext4 does not retry writes in such cases. The flag itself is advisory in this case since ext4 checks for unaligned I/Os and uses appropriate locking up front, rather than on a retry in response to -EAGAIN. As it turns out, the warning check is susceptible to false positives because there are scenarios where -EAGAIN can be expected from lower layers without necessarily having IOCB_NOWAIT set on the iocb. For example, one instance of the warning has been seen where io_uring sets IOCB_HIPRI, which in turn results in REQ_POLLED|REQ_NOWAIT on the bio. This results in -EAGAIN if the block layer is unable to allocate a request, etc. [Note that there is an outstanding patch to untangle REQ_POLLED and REQ_NOWAIT such that the latter relies on IOCB_NOWAIT, which would also address this instance of the warning.] Another instance of the warning has been reproduced by syzbot. A dio write is interrupted down in __get_user_pages_locked() waiting on the mm lock and returns -EAGAIN up the stack. If the iomap dio iteration layer has made no progress on the write to this point, -EAGAIN returns up to the filesystem and triggers the warning. This use of the overwrite flag in ext4 is precautionary and half-baked. I.e., ext4 doesn't actually implement overwrite checking in the iomap callbacks when the flag is set, so the only extra verification it provides are i_size checks in the generic iomap dio layer. Combined with the tendency for false positives, the added verification is not worth the extra trouble. Remove the flag, associated warning, and update the comments to document when concurrent unaligned dio writes are allowed and why said flag is not used. Cc: stable@kernel.org Reported-by: syzbot+5050ad0fb47527b1808a@syzkaller.appspotmail.com Reported-by: Pengfei Xu Fixes: 310ee0902b8d ("ext4: allow concurrent unaligned dio overwrites") Signed-off-by: Brian Foster Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230810165559.946222-1-bfoster@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2071b1e4322c..e99cc17b6bd2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -476,6 +476,11 @@ restart: * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). + * + * Note that unaligned writes are allowed under shared lock so long as + * they are pure overwrites. Otherwise, concurrent unaligned writes risk + * data corruption due to partial block zeroing in the dio layer, and so + * the I/O must occur exclusively. */ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || @@ -492,21 +497,12 @@ restart: /* * Now that locking is settled, determine dio flags and exclusivity - * requirements. Unaligned writes are allowed under shared lock so long - * as they are pure overwrites. Set the iomap overwrite only flag as an - * added precaution in this case. Even though this is unnecessary, we - * can detect and warn on unexpected -EAGAIN if an unsafe unaligned - * write is ever submitted. - * - * Otherwise, concurrent unaligned writes risk data corruption due to - * partial block zeroing in the dio layer, and so the I/O must occur - * exclusively. The inode lock is already held exclusive if the write is - * non-overwrite or extending, so drain all outstanding dio and set the - * force wait dio flag. + * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce + * behavior already. The inode lock is already held exclusive if the + * write is non-overwrite or extending, so drain all outstanding dio and + * set the force wait dio flag. */ - if (*ilock_shared && unaligned_io) { - *dio_flags = IOMAP_DIO_OVERWRITE_ONLY; - } else if (!*ilock_shared && (unaligned_io || *extend)) { + if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; @@ -608,7 +604,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); - WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)); if (ret == -ENOTBLK) ret = 0; -- cgit v1.2.3 From ff0722de896eb278fca193888d22278c28f2782c Mon Sep 17 00:00:00 2001 From: Vitaliy Kuznetsov Date: Thu, 10 Aug 2023 18:38:52 +0400 Subject: ext4: add periodic superblock update check This patch introduces a mechanism to periodically check and update the superblock within the ext4 file system. The main purpose of this patch is to keep the disk superblock up to date. The update will be performed if more than one hour has passed since the last update, and if more than 16MB of data have been written to disk. This check and update is performed within the ext4_journal_commit_callback function, ensuring that the superblock is written while the disk is active, rather than based on a timer that may trigger during disk idle periods. Discussion https://www.spinics.net/lists/linux-ext4/msg85865.html Signed-off-by: Vitaliy Kuznetsov Link: https://lore.kernel.org/r/20230810143852.40228-1-vk.en.mail@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6edf7deeb2db..bf0cfdffa9d0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -433,6 +433,57 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) +#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ + +/* + * The ext4_maybe_update_superblock() function checks and updates the + * superblock if needed. + * + * This function is designed to update the on-disk superblock only under + * certain conditions to prevent excessive disk writes and unnecessary + * waking of the disk from sleep. The superblock will be updated if: + * 1. More than an hour has passed since the last superblock update, and + * 2. More than 16MB have been written since the last superblock update. + * + * @sb: The superblock + */ +static void ext4_maybe_update_superblock(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + journal_t *journal = sbi->s_journal; + time64_t now; + __u64 last_update; + __u64 lifetime_write_kbytes; + __u64 diff_size; + + if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || + !journal || (journal->j_flags & JBD2_UNMOUNT)) + return; + + now = ktime_get_real_seconds(); + last_update = ext4_get_tstamp(es, s_wtime); + + if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + return; + + lifetime_write_kbytes = sbi->s_kbytes_written + + ((part_stat_read(sb->s_bdev, sectors[STAT_WRITE]) - + sbi->s_sectors_written_start) >> 1); + + /* Get the number of kilobytes not written to disk to account + * for statistics and compare with a multiple of 16 MB. This + * is used to determine when the next superblock commit should + * occur (i.e. not more often than once per 16MB if there was + * less written in an hour). + */ + diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); + + if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + schedule_work(&EXT4_SB(sb)->s_error_work); +} + /* * The del_gendisk() function uninitializes the disk-specific data * structures, including the bdi structure, without telling anyone @@ -459,6 +510,7 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); + ext4_maybe_update_superblock(sb); spin_lock(&sbi->s_md_lock); while (!list_empty(&txn->t_private_list)) { @@ -715,6 +767,7 @@ static void flush_stashed_error_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; + bool call_notify_err; handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; @@ -722,6 +775,10 @@ static void flush_stashed_error_work(struct work_struct *work) jbd2_journal_stop(handle); goto write_directly; } + + if (sbi->s_add_error_count > 0) + call_notify_err = true; + ext4_update_super(sbi->s_sb); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " @@ -735,7 +792,10 @@ static void flush_stashed_error_work(struct work_struct *work) goto write_directly; } jbd2_journal_stop(handle); - ext4_notify_error_sysfs(sbi); + + if (call_notify_err) + ext4_notify_error_sysfs(sbi); + return; } write_directly: -- cgit v1.2.3 From bb15cea20f211e110150e528fca806f38d5789e0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 22 Aug 2023 23:43:38 -0400 Subject: ext4: rename s_error_work to s_sb_upd_work The most common use that s_error_work will get scheduled is now the periodic update of the superblock. So rename it to s_sb_upd_work. Also rename the function flush_stashed_error_work() to update_super_work(). Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 9 ++++++--- fs/ext4/super.c | 32 ++++++++++++++++---------------- 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2c2c3191bf41..84618c46f239 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1698,10 +1698,13 @@ struct ext4_sb_info { const char *s_last_error_func; time64_t s_last_error_time; /* - * If we are in a context where we cannot update error information in - * the on-disk superblock, we queue this work to do it. + * If we are in a context where we cannot update the on-disk + * superblock, we queue the work here. This is used to update + * the error information in the superblock, and for periodic + * updates of the superblock called from the commit callback + * function. */ - struct work_struct s_error_work; + struct work_struct s_sb_upd_work; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index bf0cfdffa9d0..91f20afa1d71 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -481,7 +481,7 @@ static void ext4_maybe_update_superblock(struct super_block *sb) diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } /* @@ -723,7 +723,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * defer superblock flushing to a workqueue. */ if (continue_fs && journal) - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); } @@ -750,10 +750,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_flags |= SB_RDONLY; } -static void flush_stashed_error_work(struct work_struct *work) +static void update_super_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, - s_error_work); + s_sb_upd_work); journal_t *journal = sbi->s_journal; handle_t *handle; @@ -1078,7 +1078,7 @@ __acquires(bitlock) if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } return; } @@ -1318,10 +1318,10 @@ static void ext4_put_super(struct super_block *sb) * Unregister sysfs before destroying jbd2 journal. * Since we could still access attr_journal_task attribute via sysfs * path which could have sbi->s_journal->j_task as NULL - * Unregister sysfs before flush sbi->s_error_work. + * Unregister sysfs before flush sbi->s_sb_upd_work. * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If * read metadata verify failed then will queue error work. - * flush_stashed_error_work will call start_this_handle may trigger + * update_super_work will call start_this_handle may trigger * BUG_ON. */ ext4_unregister_sysfs(sb); @@ -1333,7 +1333,7 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); @@ -4998,8 +4998,8 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before destroying the journal. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; return -EINVAL; @@ -5322,7 +5322,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) timer_setup(&sbi->s_err_report, print_daily_error_info, 0); spin_lock_init(&sbi->s_error_lock); - INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + INIT_WORK(&sbi->s_sb_upd_work, update_super_work); err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); if (err) @@ -5666,16 +5666,16 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_error_work before journal destroy. */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before journal destroy. */ + flush_work(&sbi->s_sb_upd_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: - /* flush s_error_work before sbi destroy */ - flush_work(&sbi->s_error_work); + /* flush s_sb_upd_work before sbi destroy */ + flush_work(&sbi->s_sb_upd_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); ext4_group_desc_free(sbi); @@ -6551,7 +6551,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } /* Flush outstanding errors before changing fs state */ - flush_work(&sbi->s_error_work); + flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { if (ext4_forced_shutdown(sb)) { -- cgit v1.2.3 From 03de20bed203b0819225d4de98353c1f8755a1dd Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 10 Aug 2023 23:43:33 +0800 Subject: ext4: do not mark inode dirty every time when appending using delalloc In the delalloc append write scenario, if inode's i_size is extended due to buffer write, there are delalloc writes pending in the range up to i_size, and no need to touch i_disksize since writeback will push i_disksize up to i_size eventually. Offers significant performance improvement in high-frequency append write scenarios. I conducted tests in my 32-core environment by launching 32 concurrent threads to append write to the same file. Each write operation had a length of 1024 bytes and was repeated 100000 times. Without using this patch, the test was completed in 7705 ms. However, with this patch, the test was completed in 5066 ms, resulting in a performance improvement of 34%. Moreover, in test scenarios of Kafka version 2.6.2, using packet size of 2K, with this patch resulted in a 10% performance improvement. Signed-off-by: Liu Song Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230810154333.84921-1-liusong@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 88 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 26 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b9003840bc1..c5d8f8933c8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2935,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks and update i_disksize. + */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2961,30 +3020,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* -- cgit v1.2.3 From 0f6bc57971c63f7352c3564d19a5dc707fe8332a Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Sat, 12 Aug 2023 15:18:39 +0800 Subject: ext4: use LIST_HEAD() to initialize the list_head in mballoc.c Use LIST_HEAD() to initialize the list_head instead of open-coding it. Signed-off-by: Ruan Jinjie Link: https://lore.kernel.org/r/20230812071839.3481909-1-ruanjinjie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1e4c667812a9..c91db9f57524 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3504,11 +3504,10 @@ static void ext4_discard_work(struct work_struct *work) struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; - INIT_LIST_HEAD(&discard_list); spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); @@ -3882,12 +3881,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct list_head freed_data_list; + LIST_HEAD(freed_data_list); struct list_head *cut_pos = NULL; bool wake; - INIT_LIST_HEAD(&freed_data_list); - spin_lock(&sbi->s_md_lock); list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { if (entry->efd_tid != commit_tid) @@ -5414,7 +5411,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; @@ -5443,7 +5440,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, goto out_dbg; } - INIT_LIST_HEAD(&list); ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { @@ -5524,7 +5520,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; - struct list_head list; + LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; @@ -5541,8 +5537,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active), needed); - INIT_LIST_HEAD(&list); - if (needed == 0) needed = UINT_MAX; @@ -5858,13 +5852,11 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, { ext4_group_t group = 0; struct ext4_buddy e4b; - struct list_head discard_list; + LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); - INIT_LIST_HEAD(&discard_list); - spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, -- cgit v1.2.3 From 8216776ccff6fcd40e3fdaa109aa4150ebe760b3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:01 -0700 Subject: ext4: reject casefold inode flag without casefold feature It is invalid for the casefold inode flag to be set without the casefold superblock feature flag also being set. e2fsck already considers this case to be invalid and handles it by offering to clear the casefold flag on the inode. __ext4_iget() also already considered this to be invalid, sort of, but it only got so far as logging an error message; it didn't actually reject the inode. Make it reject the inode so that other code doesn't have to handle this case. This matches what f2fs does. Note: we could check 's_encoding != NULL' instead of ext4_has_feature_casefold(). This would make the check robust against the casefold feature being enabled by userspace writing to the page cache of the mounted block device. However, it's unsolvable in general for filesystems to be robust against concurrent writes to the page cache of the mounted block device. Though this very particular scenario involving the casefold feature is solvable, we should not pretend that we can support this model, so let's just check the casefold feature. tune2fs already forbids enabling casefold on a mounted filesystem. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5d8f8933c8c..6c490f05e2ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4974,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; -- cgit v1.2.3 From b81427939590450172716093dafdda8ef52e020f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:02 -0700 Subject: ext4: remove redundant checks of s_encoding Now that ext4 does not allow inodes with the casefold flag to be instantiated when unsupported, it's unnecessary to repeatedly check for support later on during random filesystem operations. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-3-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/hash.c | 2 +- fs/ext4/namei.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 46c3423ddfa1..deabe29da7fb 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -300,7 +300,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um && + if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 9a13431656cd..c0f0b4e2413b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1445,7 +1445,7 @@ int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct dx_hash_info *hinfo = &name->hinfo; int len; - if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding || + if (!IS_CASEFOLDED(dir) || (IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir))) { cf_name->name = NULL; return 0; @@ -1496,7 +1496,7 @@ static bool ext4_match(struct inode *parent, #endif #if IS_ENABLED(CONFIG_UNICODE) - if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent) && + if (IS_CASEFOLDED(parent) && (!IS_ENCRYPTED(parent) || fscrypt_has_encryption_key(parent))) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, @@ -2393,7 +2393,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, #if IS_ENABLED(CONFIG_UNICODE) if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && - sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) + utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif -- cgit v1.2.3 From 768d612f79822d30a1e7d132a4d4b05337ce42ec Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Tue, 15 Aug 2023 15:08:08 +0800 Subject: ext4: fix slab-use-after-free in ext4_es_insert_extent() Yikebaer reported an issue: ================================================================== BUG: KASAN: slab-use-after-free in ext4_es_insert_extent+0xc68/0xcb0 fs/ext4/extents_status.c:894 Read of size 4 at addr ffff888112ecc1a4 by task syz-executor/8438 CPU: 1 PID: 8438 Comm: syz-executor Not tainted 6.5.0-rc5 #1 Call Trace: [...] kasan_report+0xba/0xf0 mm/kasan/report.c:588 ext4_es_insert_extent+0xc68/0xcb0 fs/ext4/extents_status.c:894 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] Allocated by task 8438: [...] kmem_cache_zalloc include/linux/slab.h:693 [inline] __es_alloc_extent fs/ext4/extents_status.c:469 [inline] ext4_es_insert_extent+0x672/0xcb0 fs/ext4/extents_status.c:873 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] Freed by task 8438: [...] kmem_cache_free+0xec/0x490 mm/slub.c:3823 ext4_es_try_to_merge_right fs/ext4/extents_status.c:593 [inline] __es_insert_extent+0x9f4/0x1440 fs/ext4/extents_status.c:802 ext4_es_insert_extent+0x2ca/0xcb0 fs/ext4/extents_status.c:882 ext4_map_blocks+0x92a/0x16f0 fs/ext4/inode.c:680 ext4_alloc_file_blocks.isra.0+0x2df/0xb70 fs/ext4/extents.c:4462 ext4_zero_range fs/ext4/extents.c:4622 [inline] ext4_fallocate+0x251c/0x3ce0 fs/ext4/extents.c:4721 [...] ================================================================== The flow of issue triggering is as follows: 1. remove es raw es es removed es1 |-------------------| -> |----|.......|------| 2. insert es es insert es1 merge with es es1 merge with es and free es1 |----|.......|------| -> |------------|------| -> |-------------------| es merges with newes, then merges with es1, frees es1, then determines if es1->es_len is 0 and triggers a UAF. The code flow is as follows: ext4_es_insert_extent es1 = __es_alloc_extent(true); es2 = __es_alloc_extent(true); __es_remove_extent(inode, lblk, end, NULL, es1) __es_insert_extent(inode, &newes, es1) ---> insert es1 to es tree __es_insert_extent(inode, &newes, es2) ext4_es_try_to_merge_right ext4_es_free_extent(inode, es1) ---> es1 is freed if (es1 && !es1->es_len) // Trigger UAF by determining if es1 is used. We determine whether es1 or es2 is used immediately after calling __es_remove_extent() or __es_insert_extent() to avoid triggering a UAF if es1 or es2 is freed. Reported-by: Yikebaer Aizezi Closes: https://lore.kernel.org/lkml/CALcu4raD4h9coiyEBL4Bm0zjDwxC2CyPiTwsP3zFuhot6y9Beg@mail.gmail.com Fixes: 2a69c450083d ("ext4: using nofail preallocation in ext4_es_insert_extent()") Cc: stable@kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230815070808.3377171-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 44 ++++++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'fs/ext4') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 9b5b8951afb4..6f7de14c0fa8 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -878,23 +878,29 @@ retry: err1 = __es_remove_extent(inode, lblk, end, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 == -ENOMEM && !ext4_es_must_keep(&newes)) err2 = 0; if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && (status & EXTENT_STATUS_WRITTEN || status & EXTENT_STATUS_UNWRITTEN)) __revise_pending(inode, lblk, len); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) @@ -1491,8 +1497,12 @@ retry: */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end, &reserved, es); - if (es && !es->es_len) - __es_free_extent(es); + /* Free preallocated extent if it didn't get used. */ + if (es) { + if (!es->es_len) + __es_free_extent(es); + es = NULL; + } write_unlock(&EXT4_I(inode)->i_es_lock); if (err) goto retry; @@ -2047,19 +2057,25 @@ retry: err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); if (err1 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es1) { + if (!es1->es_len) + __es_free_extent(es1); + es1 = NULL; + } err2 = __es_insert_extent(inode, &newes, es2); if (err2 != 0) goto error; + /* Free preallocated extent if it didn't get used. */ + if (es2) { + if (!es2->es_len) + __es_free_extent(es2); + es2 = NULL; + } if (allocated) __insert_pending(inode, lblk); - - /* es is pre-allocated but not used, free it. */ - if (es1 && !es1->es_len) - __es_free_extent(es1); - if (es2 && !es2->es_len) - __es_free_extent(es2); error: write_unlock(&EXT4_I(inode)->i_es_lock); if (err1 || err2) -- cgit v1.2.3