From 7e0d0d44001506bc42932b5e37baaab84f0397cf Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Tue, 15 Mar 2022 17:14:54 +0530 Subject: ext4: get rid of unused DEFAULT_MB_OPTIMIZE_SCAN After recent changes to the mb_optimize_scan mount option the DEFAULT_MB_OPTIMIZE_SCAN is no longer needed so get rid of it. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220315114454.104182-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1466fbdbc8e3..218f6bc83368 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1867,7 +1867,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { }; #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -#define DEFAULT_MB_OPTIMIZE_SCAN (-1) static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" -- cgit v1.2.3 From f4534c9fc94d22383f187b9409abb3f9df2e3db3 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Sat, 26 Mar 2022 14:53:51 +0800 Subject: ext4: fix warning in ext4_handle_inode_extension We got issue as follows: EXT4-fs error (device loop0) in ext4_reserve_inode_write:5741: Out of memory EXT4-fs error (device loop0): ext4_setattr:5462: inode #13: comm syz-executor.0: mark_inode_dirty error EXT4-fs error (device loop0) in ext4_setattr:5519: Out of memory EXT4-fs error (device loop0): ext4_ind_map_blocks:595: inode #13: comm syz-executor.0: Can't allocate blocks for non-extent mapped inodes with bigalloc ------------[ cut here ]------------ WARNING: CPU: 1 PID: 4361 at fs/ext4/file.c:301 ext4_file_write_iter+0x11c9/0x1220 Modules linked in: CPU: 1 PID: 4361 Comm: syz-executor.0 Not tainted 5.10.0+ #1 RIP: 0010:ext4_file_write_iter+0x11c9/0x1220 RSP: 0018:ffff924d80b27c00 EFLAGS: 00010282 RAX: ffffffff815a3379 RBX: 0000000000000000 RCX: 000000003b000000 RDX: ffff924d81601000 RSI: 00000000000009cc RDI: 00000000000009cd RBP: 000000000000000d R08: ffffffffbc5a2c6b R09: 0000902e0e52a96f R10: ffff902e2b7c1b40 R11: ffff902e2b7c1b40 R12: 000000000000000a R13: 0000000000000001 R14: ffff902e0e52aa10 R15: ffffffffffffff8b FS: 00007f81a7f65700(0000) GS:ffff902e3bc80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffff600400 CR3: 000000012db88001 CR4: 00000000003706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: do_iter_readv_writev+0x2e5/0x360 do_iter_write+0x112/0x4c0 do_pwritev+0x1e5/0x390 __x64_sys_pwritev2+0x7e/0xa0 do_syscall_64+0x37/0x50 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Above issue may happen as follows: Assume inode.i_size=4096 EXT4_I(inode)->i_disksize=4096 step 1: set inode->i_isize = 8192 ext4_setattr if (attr->ia_size != inode->i_size) EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty ext4_reserve_inode_write ext4_get_inode_loc __ext4_get_inode_loc sb_getblk --> return -ENOMEM ... if (!error) ->will not update i_size i_size_write(inode, attr->ia_size); Now: inode.i_size=4096 EXT4_I(inode)->i_disksize=8192 step 2: Direct write 4096 bytes ext4_file_write_iter ext4_dio_write_iter iomap_dio_rw ->return error if (extend) ext4_handle_inode_extension WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); ->Then trigger warning. To solve above issue, if mark inode dirty failed in ext4_setattr just set 'EXT4_I(inode)->i_disksize' with old value. Signed-off-by: Ye Bin Link: https://lore.kernel.org/r/20220326065351.761952-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 646ece9b3455..15165c87c915 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5398,6 +5398,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; + loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { @@ -5469,6 +5470,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode->i_sb->s_blocksize_bits); down_write(&EXT4_I(inode)->i_data_sem); + old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5480,6 +5482,8 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, */ if (!error) i_size_write(inode, attr->ia_size); + else + EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) -- cgit v1.2.3 From c30365b90ab26fa991751119cde047312d370cab Mon Sep 17 00:00:00 2001 From: Yu Zhe Date: Fri, 1 Apr 2022 01:13:21 -0700 Subject: ext4: remove unnecessary type castings remove unnecessary void* type castings. Signed-off-by: Yu Zhe Link: https://lore.kernel.org/r/20220401081321.73735-1-yuzhe@nfschina.com Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 2 +- fs/ext4/fast_commit.c | 10 +++++----- fs/ext4/inline.c | 6 +++--- fs/ext4/mballoc.c | 2 +- fs/ext4/mmp.c | 2 +- fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index a6bb86f52b9a..fc982d2dc071 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -648,7 +648,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, unsigned int offset = 0; char *top; - de = (struct ext4_dir_entry_2 *)buf; + de = buf; top = buf + buf_size; while ((char *) de < top) { if (ext4_check_dir_entry(dir, NULL, de, bh, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 3d72565ec6e8..7b0b57be8615 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -970,7 +970,7 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; @@ -1004,7 +1004,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; @@ -1031,7 +1031,7 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; @@ -1093,7 +1093,7 @@ lock_and_exit: static int ext4_fc_perform_commit(journal_t *journal) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; @@ -1198,7 +1198,7 @@ static void ext4_fc_update_stats(struct super_block *sb, int status, */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { - struct super_block *sb = (struct super_block *)(journal->j_private); + struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 9c076262770d..13a63e3639b1 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1083,14 +1083,14 @@ static void ext4_update_final_de(void *de_buf, int old_size, int new_size) void *limit; int de_len; - de = (struct ext4_dir_entry_2 *)de_buf; + de = de_buf; if (old_size) { limit = de_buf + old_size; do { prev_de = de; de_len = ext4_rec_len_from_disk(de->rec_len, old_size); de_buf += de_len; - de = (struct ext4_dir_entry_2 *)de_buf; + de = de_buf; } while (de_buf < limit); prev_de->rec_len = ext4_rec_len_to_disk(de_len + new_size - @@ -1155,7 +1155,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, * First create "." and ".." and then copy the dir information * back to the block. */ - de = (struct ext4_dir_entry_2 *)target; + de = target; de = ext4_init_dot_dotdot(inode, de, inode->i_sb->s_blocksize, csum_size, le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 252c168454c7..dcb81352b864 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2919,7 +2919,7 @@ const struct seq_operations ext4_mb_seq_groups_ops = { int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { - struct super_block *sb = (struct super_block *)seq->private; + struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index cebea4270817..79d05e464c43 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -127,7 +127,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, */ static int kmmpd(void *data) { - struct super_block *sb = (struct super_block *) data; + struct super_block *sb = data; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *bh = EXT4_SB(sb)->s_mmp_bh; struct mmp_struct *mmp; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 767b4bfe39c3..5675d3b38032 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2031,7 +2031,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, unsigned int offset = 0; char *top; - de = (struct ext4_dir_entry_2 *)buf; + de = buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, @@ -2587,7 +2587,7 @@ int ext4_generic_delete_entry(struct inode *dir, i = 0; pde = NULL; - de = (struct ext4_dir_entry_2 *)entry_buf; + de = entry_buf; while (i < buf_size - csum_size) { if (ext4_check_dir_entry(dir, NULL, de, bh, entry_buf, buf_size, i)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 218f6bc83368..b70ab3d3165d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1397,7 +1397,7 @@ static void ext4_destroy_inode(struct inode *inode) static void init_once(void *foo) { - struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; + struct ext4_inode_info *ei = foo; INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); @@ -3836,7 +3836,7 @@ static struct task_struct *ext4_lazyinit_task; */ static int ext4_lazyinit_thread(void *arg) { - struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; + struct ext4_lazy_init *eli = arg; struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; -- cgit v1.2.3 From fac88735278b4508ab8de505530be8413fd32fc2 Mon Sep 17 00:00:00 2001 From: Chin Yik Ming Date: Sat, 2 Apr 2022 02:07:44 -0700 Subject: ext4: fix spelling errors in comments 'functoin' and 'entres' should be 'function' and 'entries' respectively Signed-off-by: Chin Yik Ming Link: https://lore.kernel.org/r/20220402090744.8918-1-yikming2222@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fc982d2dc071..3985f8c33f95 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -412,7 +412,7 @@ struct fname { }; /* - * This functoin implements a non-recursive way of freeing all of the + * This function implements a non-recursive way of freeing all of the * nodes in the red-black tree. */ static void free_rb_tree_fname(struct rb_root *root) @@ -515,7 +515,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, /* * This is a helper function for ext4_dx_readdir. It calls filldir - * for all entres on the fname linked list. (Normally there is only + * for all entries on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ static int call_filldir(struct file *file, struct dir_context *ctx, -- cgit v1.2.3 From af2b327581582a959f1923eca9cafee5b29a7399 Mon Sep 17 00:00:00 2001 From: Jinke Han Date: Mon, 4 Apr 2022 23:22:43 +0800 Subject: ext4: remove unnecessary code in __mb_check_buddy When enter elseif branch, the the MB_CHECK_ASSERT will never fail. In addtion, the only illegal combination is 0/0, which can be caught by the first if branch. Signed-off-by: Jinke Han Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220404152243.13556-1-hanjinke.666@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dcb81352b864..3e715b837e70 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -695,13 +695,10 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { - /* only single bit in buddy2 may be 1 */ + /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); - } else if (!mb_test_bit((i << 1) + 1, buddy2)) { - MB_CHECK_ASSERT( - mb_test_bit(i << 1, buddy2)); } continue; } -- cgit v1.2.3 From 784a09951c1d8383498c0df091a37db612bebfc7 Mon Sep 17 00:00:00 2001 From: Lv Ruyi Date: Mon, 11 Apr 2022 03:23:37 +0000 Subject: ext4: remove unnecessary conditionals iput() has already handled null and non-null parameter, so it is no need to use if(). Reported-by: Zeal Robot Signed-off-by: Lv Ruyi Link: https://lore.kernel.org/r/20220411032337.2517465-1-lv.ruyi@zte.com.cn Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 3 +-- fs/ext4/namei.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7b0b57be8615..795a60ad1897 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1659,8 +1659,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl, set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: - if (inode) - iput(inode); + iput(inode); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5675d3b38032..dfe5514035c1 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3363,8 +3363,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); - if (inode) - iput(inode); + iput(inode); goto out_free_encrypted_link; err_drop_inode: -- cgit v1.2.3 From 4808cb5b98b436f1110d83c65541dd43beb45f63 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Tue, 12 Apr 2022 22:53:20 +0800 Subject: ext4: add unmount filesystem message Now that we have kernel message at mount time, system administrator could acquire the mount time, device and options easily. But we don't have corresponding unmounting message at umount time, so we cannot know if someone umount a filesystem easily. Some of the modern filesystems (e.g. xfs) have the umounting kernel message, so add one for ext4 filesystem for convenience. EXT4-fs (sdb): mounted filesystem with ordered data mode. Quota mode: none. EXT4-fs (sdb): unmounting filesystem. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220412145320.2669897-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b70ab3d3165d..b511203f7111 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1211,6 +1211,9 @@ static void ext4_put_super(struct super_block *sb) */ ext4_unregister_sysfs(sb); + if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs unmount")) + ext4_msg(sb, KERN_INFO, "unmounting filesystem."); + ext4_unregister_li_request(sb); ext4_quota_off_umount(sb); -- cgit v1.2.3 From 0be698ecbe4471fcad80e81ec6a05001421041b3 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Thu, 14 Apr 2022 10:52:23 +0800 Subject: ext4: fix use-after-free in ext4_rename_dir_prepare We got issue as follows: EXT4-fs (loop0): mounted filesystem without journal. Opts: ,errors=continue ext4_get_first_dir_block: bh->b_data=0xffff88810bee6000 len=34478 ext4_get_first_dir_block: *parent_de=0xffff88810beee6ae bh->b_data=0xffff88810bee6000 ext4_rename_dir_prepare: [1] parent_de=0xffff88810beee6ae ================================================================== BUG: KASAN: use-after-free in ext4_rename_dir_prepare+0x152/0x220 Read of size 4 at addr ffff88810beee6ae by task rep/1895 CPU: 13 PID: 1895 Comm: rep Not tainted 5.10.0+ #241 Call Trace: dump_stack+0xbe/0xf9 print_address_description.constprop.0+0x1e/0x220 kasan_report.cold+0x37/0x7f ext4_rename_dir_prepare+0x152/0x220 ext4_rename+0xf44/0x1ad0 ext4_rename2+0x11c/0x170 vfs_rename+0xa84/0x1440 do_renameat2+0x683/0x8f0 __x64_sys_renameat+0x53/0x60 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f45a6fc41c9 RSP: 002b:00007ffc5a470218 EFLAGS: 00000246 ORIG_RAX: 0000000000000108 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f45a6fc41c9 RDX: 0000000000000005 RSI: 0000000020000180 RDI: 0000000000000005 RBP: 00007ffc5a470240 R08: 00007ffc5a470160 R09: 0000000020000080 R10: 00000000200001c0 R11: 0000000000000246 R12: 0000000000400bb0 R13: 00007ffc5a470320 R14: 0000000000000000 R15: 0000000000000000 The buggy address belongs to the page: page:00000000440015ce refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x10beee flags: 0x200000000000000() raw: 0200000000000000 ffffea00043ff4c8 ffffea0004325608 0000000000000000 raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88810beee580: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88810beee600: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >ffff88810beee680: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ ffff88810beee700: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ffff88810beee780: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ================================================================== Disabling lock debugging due to kernel taint ext4_rename_dir_prepare: [2] parent_de->inode=3537895424 ext4_rename_dir_prepare: [3] dir=0xffff888124170140 ext4_rename_dir_prepare: [4] ino=2 ext4_rename_dir_prepare: ent->dir->i_ino=2 parent=-757071872 Reason is first directory entry which 'rec_len' is 34478, then will get illegal parent entry. Now, we do not check directory entry after read directory block in 'ext4_get_first_dir_block'. To solve this issue, check directory entry in 'ext4_get_first_dir_block'. [ Trigger an ext4_error() instead of just warning if the directory is missing a '.' or '..' entry. Also make sure we return an error code if the file system is corrupted. -TYT ] Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220414025223.4113128-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/namei.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dfe5514035c1..b202626391ff 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3454,6 +3454,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh; if (!ext4_has_inline_data(inode)) { + struct ext4_dir_entry_2 *de; + unsigned int offset; + /* The first directory block must not be a hole, so * treat it as DIRENT_HTREE */ @@ -3462,9 +3465,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, *retval = PTR_ERR(bh); return NULL; } - *parent_de = ext4_next_entry( - (struct ext4_dir_entry_2 *)bh->b_data, - inode->i_sb->s_blocksize); + + de = (struct ext4_dir_entry_2 *) bh->b_data; + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, 0) || + le32_to_cpu(de->inode) != inode->i_ino || + strcmp(".", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '.'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + offset = ext4_rec_len_from_disk(de->rec_len, + inode->i_sb->s_blocksize); + de = ext4_next_entry(de, inode->i_sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '..'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + *parent_de = de; + return bh; } -- cgit v1.2.3 From d63c00ea435a5352f486c259665a4ced60399421 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 17 Apr 2022 20:03:15 +0300 Subject: ext4: mark group as trimmed only if it was fully scanned Otherwise nonaligned fstrim calls will works inconveniently for iterative scanners, for example: // trim [0,16MB] for group-1, but mark full group as trimmed fstrim -o $((1024*1024*128)) -l $((1024*1024*16)) ./m // handle [16MB,16MB] for group-1, do nothing because group already has the flag. fstrim -o $((1024*1024*144)) -l $((1024*1024*16)) ./m [ Update function documentation for ext4_trim_all_free -- TYT ] Signed-off-by: Dmitry Monakhov Link: https://lore.kernel.org/r/1650214995-860245-1-git-send-email-dmtrmonakhov@yandex-team.ru Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3e715b837e70..bc90635b757c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6395,6 +6395,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count + * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6404,7 +6405,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) + ext4_grpblk_t minblocks, bool set_trimmed) { struct ext4_buddy e4b; int ret; @@ -6423,7 +6424,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) { ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0) + if (ret >= 0 && set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); } else { ret = 0; @@ -6460,6 +6461,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); + bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -6478,8 +6480,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks) + if (end >= max_blks - 1) { end = max_blks - 1; + eof = true; + } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -6493,6 +6497,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + whole_group = true; for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); @@ -6509,12 +6514,13 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) + if (group == last_group) { end = last_cluster; - + whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; + } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen); + end, minlen, whole_group); if (cnt < 0) { ret = cnt; break; -- cgit v1.2.3 From e4e58e5df309d695799c494958962100a4c25039 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Mon, 18 Apr 2022 14:05:45 +0530 Subject: ext4: fix journal_ioprio mount option handling In __ext4_super() we always overwrote the user specified journal_ioprio value with a default value, expecting parse_apply_sb_mount_options() to later correctly set ctx->journal_ioprio to the user specified value. However, if parse_apply_sb_mount_options() returned early because of empty sbi->es_s->s_mount_opts, the correct journal_ioprio value was never set. This patch fixes __ext4_super() to only use the default value if the user has not specified any value for journal_ioprio. Similarly, the remount behavior was to either use journal_ioprio value specified during initial mount, or use the default value irrespective of the journal_ioprio value specified during remount. This patch modifies this to first check if a new value for ioprio has been passed during remount and apply it. If no new value is passed, use the value specified during initial mount. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani Tested-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220418083545.45778-1-ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/super.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b511203f7111..4b0ea8df1f5c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4411,7 +4411,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) int silent = fc->sb_flags & SB_SILENT; /* Set defaults for the variables that will be set during parsing */ - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -6278,7 +6279,6 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) char *to_free[EXT4_MAXQUOTAS]; #endif - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; /* Store the original options */ old_sb_flags = sb->s_flags; @@ -6304,9 +6304,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } else old_opts.s_qf_names[i] = NULL; #endif - if (sbi->s_journal && sbi->s_journal->j_task->io_context) - ctx->journal_ioprio = - sbi->s_journal->j_task->io_context->ioprio; + if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) { + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + ctx->journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; + else + ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + + } ext4_apply_options(fc, sb); -- cgit v1.2.3 From 9558cf14e8d2149a8df402b74041a99835801932 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sun, 24 Apr 2022 22:09:35 +0800 Subject: ext4: add nowait mode for ext4_getblk() Current ext4_getblk() might sleep if some resources are not valid or could be race with a concurrent extents modifing procedure. So we cannot call ext4_getblk() and ext4_map_blocks() to get map blocks in the atomic context in some fast path (e.g. the upcoming procedure of getting symlink external block in the RCU context), even if the map extents have already been check and cached. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20220424140936.1898920-2-yi.zhang@huawei.com --- fs/ext4/ext4.h | 2 ++ fs/ext4/inode.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a743b1e3b89e..797bc572d6fb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -673,6 +673,8 @@ enum { /* Caller will submit data before dropping transaction handle. This * allows jbd2 to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Caller is in the atomic contex, find extent if it has been cached */ +#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * The bit position of these flags must not overlap with any of the diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 15165c87c915..f7b4787d6679 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -545,12 +545,21 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, } else { BUG(); } + + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } + /* + * In the query cache no-wait mode, nothing we can do more if we + * cannot find extent in the cache. + */ + if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) + return 0; /* * Try to see if we can get the block without requesting a new @@ -837,10 +846,12 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; + bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); + ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; @@ -851,6 +862,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, if (err < 0) return ERR_PTR(err); + if (nowait) + return sb_find_get_block(inode->i_sb, map.m_pblk); + bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 6493792d3299b3e33f887ef8a150099d271faf9c Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Sun, 24 Apr 2022 22:09:36 +0800 Subject: ext4: convert symlink external data block mapping to bdev Symlink's external data block is one kind of metadata block, and now that almost all ext4 metadata block's page cache (e.g. directory blocks, quota blocks...) belongs to bdev backing inode except the symlink. It is essentially worked in data=journal mode like other regular file's data block because probably in order to make it simple for generic VFS code handling symlinks or some other historical reasons, but the logic of creating external data block in ext4_symlink() is complicated. and it also make things confused if user do not want to let the filesystem worked in data=journal mode. This patch convert the final exceptional case and make things clean, move the mapping of the symlink's external data block to bdev like any other metadata block does. Signed-off-by: Zhang Yi Link: https://lore.kernel.org/r/20220424140936.1898920-3-yi.zhang@huawei.com --- fs/ext4/inode.c | 9 +--- fs/ext4/namei.c | 123 ++++++++++++++++++++++++------------------------------ fs/ext4/symlink.c | 51 ++++++++++++++++++---- 3 files changed, 100 insertions(+), 83 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f7b4787d6679..5948bbba28e3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -199,8 +199,7 @@ void ext4_evict_inode(struct inode *inode) */ if (inode->i_ino != EXT4_JOURNAL_INO && ext4_should_journal_data(inode) && - (S_ISLNK(inode->i_mode) || S_ISREG(inode->i_mode)) && - inode->i_data.nrpages) { + S_ISREG(inode->i_mode) && inode->i_data.nrpages) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; @@ -2958,8 +2957,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb) || S_ISLNK(inode->i_mode) || - ext4_verity_in_progress(inode)) { + if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -5005,7 +5003,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; - ext4_set_aops(inode); } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; @@ -5013,9 +5010,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; - ext4_set_aops(inode); } - inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index b202626391ff..7aca8944901d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3249,6 +3249,32 @@ out_trace: return retval; } +static int ext4_init_symlink_block(handle_t *handle, struct inode *inode, + struct fscrypt_str *disk_link) +{ + struct buffer_head *bh; + char *kaddr; + int err = 0; + + bh = ext4_bread(handle, inode, 0, EXT4_GET_BLOCKS_CREATE); + if (IS_ERR(bh)) + return PTR_ERR(bh); + + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); + if (err) + goto out; + + kaddr = (char *)bh->b_data; + memcpy(kaddr, disk_link->name, disk_link->len); + inode->i_size = disk_link->len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_handle_dirty_metadata(handle, inode, bh); +out: + brelse(bh); + return err; +} + static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { @@ -3257,6 +3283,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, int err, len = strlen(symname); int credits; struct fscrypt_str disk_link; + int retries = 0; if (unlikely(ext4_forced_shutdown(EXT4_SB(dir->i_sb)))) return -EIO; @@ -3270,26 +3297,15 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - /* - * For non-fast symlinks, we just allocate inode and put it on - * orphan list in the first transaction => we need bitmap, - * group descriptor, sb, inode block, quota blocks, and - * possibly selinux xattr blocks. - */ - credits = 4 + EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + - EXT4_XATTR_TRANS_BLOCKS; - } else { - /* - * Fast symlink. We have to add entry to directory - * (EXT4_DATA_TRANS_BLOCKS + EXT4_INDEX_EXTRA_TRANS_BLOCKS), - * allocate new inode (bitmap, group descriptor, inode block, - * quota blocks, sb is already counted in previous macros). - */ - credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; - } - + /* + * EXT4_INDEX_EXTRA_TRANS_BLOCKS for addition of entry into the + * directory. +3 for inode, inode bitmap, group descriptor allocation. + * EXT4_DATA_TRANS_BLOCKS for the data block allocation and + * modification. + */ + credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; +retry: inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3297,7 +3313,8 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return PTR_ERR(inode); + err = PTR_ERR(inode); + goto out_retry; } if (IS_ENCRYPTED(inode)) { @@ -3305,74 +3322,44 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto err_drop_inode; inode->i_op = &ext4_encrypted_symlink_inode_operations; + } else { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + inode->i_op = &ext4_symlink_inode_operations; + } else { + inode->i_op = &ext4_fast_symlink_inode_operations; + inode->i_link = (char *)&EXT4_I(inode)->i_data; + } } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - if (!IS_ENCRYPTED(inode)) - inode->i_op = &ext4_symlink_inode_operations; - inode_nohighmem(inode); - ext4_set_aops(inode); - /* - * We cannot call page_symlink() with transaction started - * because it calls into ext4_write_begin() which can wait - * for transaction commit if we are running out of space - * and thus we deadlock. So we have to stop transaction now - * and restart it when symlink contents is written. - * - * To keep fs consistent in case of crash, we have to put inode - * to orphan list in the mean time. - */ - drop_nlink(inode); - err = ext4_orphan_add(handle, inode); - if (handle) - ext4_journal_stop(handle); - handle = NULL; - if (err) - goto err_drop_inode; - err = __page_symlink(inode, disk_link.name, disk_link.len, 1); - if (err) - goto err_drop_inode; - /* - * Now inode is being linked into dir (EXT4_DATA_TRANS_BLOCKS - * + EXT4_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified - */ - handle = ext4_journal_start(dir, EXT4_HT_DIR, - EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - handle = NULL; - goto err_drop_inode; - } - set_nlink(inode, 1); - err = ext4_orphan_del(handle, inode); + /* alloc symlink block and fill it */ + err = ext4_init_symlink_block(handle, inode, &disk_link); if (err) goto err_drop_inode; } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - if (!IS_ENCRYPTED(inode)) { - inode->i_op = &ext4_fast_symlink_inode_operations; - inode->i_link = (char *)&EXT4_I(inode)->i_data; - } memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; + EXT4_I(inode)->i_disksize = inode->i_size; } - EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, &inode); if (handle) ext4_journal_stop(handle); iput(inode); - goto out_free_encrypted_link; + goto out_retry; err_drop_inode: - if (handle) - ext4_journal_stop(handle); clear_nlink(inode); + ext4_orphan_add(handle, inode); unlock_new_inode(inode); + if (handle) + ext4_journal_stop(handle); iput(inode); -out_free_encrypted_link: +out_retry: + if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) + goto retry; if (disk_link.name != (unsigned char *)symname) kfree(disk_link.name); return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 69109746e6e2..d281f5bcc526 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -27,7 +27,7 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *cpage = NULL; + struct buffer_head *bh = NULL; const void *caddr; unsigned int max_size; const char *paddr; @@ -39,16 +39,19 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, caddr = EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { - cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) - return ERR_CAST(cpage); - caddr = page_address(cpage); + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + caddr = bh->b_data; max_size = inode->i_sb->s_blocksize; } paddr = fscrypt_get_symlink(inode, caddr, max_size, done); - if (cpage) - put_page(cpage); + brelse(bh); return paddr; } @@ -62,6 +65,38 @@ static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, return fscrypt_symlink_getattr(path, stat); } +static void ext4_free_link(void *bh) +{ + brelse(bh); +} + +static const char *ext4_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + struct buffer_head *bh; + + if (!dentry) { + bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh || !ext4_buffer_uptodate(bh)) + return ERR_PTR(-ECHILD); + } else { + bh = ext4_bread(NULL, inode, 0, 0); + if (IS_ERR(bh)) + return ERR_CAST(bh); + if (!bh) { + EXT4_ERROR_INODE(inode, "bad symlink."); + return ERR_PTR(-EFSCORRUPTED); + } + } + + set_delayed_call(callback, ext4_free_link, bh); + nd_terminate_link(bh->b_data, inode->i_size, + inode->i_sb->s_blocksize - 1); + return bh->b_data; +} + const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, @@ -70,7 +105,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { }; const struct inode_operations ext4_symlink_inode_operations = { - .get_link = page_get_link, + .get_link = ext4_get_link, .setattr = ext4_setattr, .getattr = ext4_getattr, .listxattr = ext4_listxattr, -- cgit v1.2.3 From f87c7a4b084afc13190cbb263538e444cb2b392a Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Thu, 28 Apr 2022 21:40:31 +0800 Subject: ext4: fix race condition between ext4_write and ext4_convert_inline_data Hulk Robot reported a BUG_ON: ================================================================== EXT4-fs error (device loop3): ext4_mb_generate_buddy:805: group 0, block bitmap and bg descriptor inconsistent: 25 vs 31513 free clusters kernel BUG at fs/ext4/ext4_jbd2.c:53! invalid opcode: 0000 [#1] SMP KASAN PTI CPU: 0 PID: 25371 Comm: syz-executor.3 Not tainted 5.10.0+ #1 RIP: 0010:ext4_put_nojournal fs/ext4/ext4_jbd2.c:53 [inline] RIP: 0010:__ext4_journal_stop+0x10e/0x110 fs/ext4/ext4_jbd2.c:116 [...] Call Trace: ext4_write_inline_data_end+0x59a/0x730 fs/ext4/inline.c:795 generic_perform_write+0x279/0x3c0 mm/filemap.c:3344 ext4_buffered_write_iter+0x2e3/0x3d0 fs/ext4/file.c:270 ext4_file_write_iter+0x30a/0x11c0 fs/ext4/file.c:520 do_iter_readv_writev+0x339/0x3c0 fs/read_write.c:732 do_iter_write+0x107/0x430 fs/read_write.c:861 vfs_writev fs/read_write.c:934 [inline] do_pwritev+0x1e5/0x380 fs/read_write.c:1031 [...] ================================================================== Above issue may happen as follows: cpu1 cpu2 __________________________|__________________________ do_pwritev vfs_writev do_iter_write ext4_file_write_iter ext4_buffered_write_iter generic_perform_write ext4_da_write_begin vfs_fallocate ext4_fallocate ext4_convert_inline_data ext4_convert_inline_data_nolock ext4_destroy_inline_data_nolock clear EXT4_STATE_MAY_INLINE_DATA ext4_map_blocks ext4_ext_map_blocks ext4_mb_new_blocks ext4_mb_regular_allocator ext4_mb_good_group_nolock ext4_mb_init_group ext4_mb_init_cache ext4_mb_generate_buddy --> error ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ext4_restore_inline_data set EXT4_STATE_MAY_INLINE_DATA ext4_block_write_begin ext4_da_write_end ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ext4_write_inline_data_end handle=NULL ext4_journal_stop(handle) __ext4_journal_stop ext4_put_nojournal(handle) ref_cnt = (unsigned long)handle BUG_ON(ref_cnt == 0) ---> BUG_ON The lock held by ext4_convert_inline_data is xattr_sem, but the lock held by generic_perform_write is i_rwsem. Therefore, the two locks can be concurrent. To solve above issue, we add inode_lock() for ext4_convert_inline_data(). At the same time, move ext4_convert_inline_data() in front of ext4_punch_hole(), remove similar handling from ext4_punch_hole(). Fixes: 0c8d414f163f ("ext4: let fallocate handle inline data correctly") Cc: stable@vger.kernel.org Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220428134031.4153381-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 10 ++++++---- fs/ext4/inode.c | 9 --------- 2 files changed, 6 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e473fde6b64b..474479ce76e0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4693,15 +4693,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; + inode_lock(inode); + ret = ext4_convert_inline_data(inode); + inode_unlock(inode); + if (ret) + goto exit; + if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(file, offset, len); goto exit; } - ret = ext4_convert_inline_data(inode); - if (ret) - goto exit; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(file, offset, len); goto exit; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5948bbba28e3..890f769d6e20 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3979,15 +3979,6 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - if (ext4_has_inline_data(inode)) { - filemap_invalidate_lock(mapping); - ret = ext4_convert_inline_data(inode); - filemap_invalidate_unlock(mapping); - if (ret) - return ret; - } - /* * Write out all dirty pages to avoid race conditions * Then release them. -- cgit v1.2.3 From b10b6278ae17366fec058219623c757b3302baae Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 5 May 2022 06:50:25 +0800 Subject: ext4: remove duplicated #include of dax.h in inode.c Fix following includecheck warning: ./fs/ext4/inode.c: linux/dax.h is included more than once. Reported-by: Abaci Robot Signed-off-by: Yang Li Link: https://lore.kernel.org/r/20220504225025.44753-1-yang.lee@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 890f769d6e20..7555cbe77148 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,7 +41,6 @@ #include #include #include -#include #include "ext4_jbd2.h" #include "xattr.h" -- cgit v1.2.3 From cb8435dc8ba33bcafa41cf2aa253794320a3b8df Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 10 May 2022 11:32:32 -0700 Subject: ext4: reject the 'commit' option on ext2 filesystems The 'commit' option is only applicable for ext3 and ext4 filesystems, and has never been accepted by the ext2 filesystem driver, so the ext4 driver shouldn't allow it on ext2 filesystems. This fixes a failure in xfstest ext4/053. Fixes: 8dc0aa8cf0f7 ("ext4: check incompatible mount options while mounting ext2/3") Signed-off-by: Eric Biggers Reviewed-by: Ritesh Harjani Reviewed-by: Lukas Czerner Link: https://lore.kernel.org/r/20220510183232.172615-1-ebiggers@kernel.org --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4b0ea8df1f5c..3f59efd3aa3e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1915,6 +1915,7 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET}, {Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR}, + {Opt_commit, 0, MOPT_NO_EXT2}, {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, -- cgit v1.2.3 From c069db76ed7b681c69159f44be96d2137e9ca989 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 13 May 2022 16:16:01 -0700 Subject: ext4: fix memory leak in parse_apply_sb_mount_options() If processing the on-disk mount options fails after any memory was allocated in the ext4_fs_context, e.g. s_qf_names, then this memory is leaked. Fix this by calling ext4_fc_free() instead of kfree() directly. Reproducer: mkfs.ext4 -F /dev/vdc tune2fs /dev/vdc -E mount_opts=usrjquota=file echo clear > /sys/kernel/debug/kmemleak mount /dev/vdc /vdc echo scan > /sys/kernel/debug/kmemleak sleep 5 echo scan > /sys/kernel/debug/kmemleak cat /sys/kernel/debug/kmemleak Fixes: 7edfd85b1ffd ("ext4: Completely separate options parsing and sb setup") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Tested-by: Ritesh Harjani Link: https://lore.kernel.org/r/20220513231605.175121-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f59efd3aa3e..ea8255a03305 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2628,8 +2628,10 @@ parse_failed: ret = ext4_apply_options(fc, sb); out_free: - kfree(s_ctx); - kfree(fc); + if (fc) { + ext4_fc_free(fc); + kfree(fc); + } kfree(s_mount_opts); return ret; } -- cgit v1.2.3 From b1241c8eb977826cf9b1588514ab8896aa23896b Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 15 May 2022 12:07:46 +0530 Subject: ext4: move ext4 crypto code to its own file crypto.c This is to cleanup super.c file which has grown quite large. So, start moving ext4 crypto related code to where it should be in the first place i.e. fs/ext4/crypto.c Reviewed-by: Eric Biggers Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/7d637e093cbc34d727397e8d41a53a1b9ca7d7a4.1652595565.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/Makefile | 1 + fs/ext4/crypto.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 3 ++ fs/ext4/super.c | 122 ---------------------------------------------------- 4 files changed, 131 insertions(+), 122 deletions(-) create mode 100644 fs/ext4/crypto.c (limited to 'fs') diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 7d89142e1421..72206a292676 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -17,3 +17,4 @@ ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o ext4-inode-test-objs += inode-test.o obj-$(CONFIG_EXT4_KUNIT_TESTS) += ext4-inode-test.o ext4-$(CONFIG_FS_VERITY) += verity.o +ext4-$(CONFIG_FS_ENCRYPTION) += crypto.o diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c new file mode 100644 index 000000000000..e5413c0970ee --- /dev/null +++ b/fs/ext4/crypto.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "ext4.h" +#include "xattr.h" +#include "ext4_jbd2.h" + +static int ext4_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); +} + +static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + handle_t *handle = fs_data; + int res, res2, credits, retries = 0; + + /* + * Encrypting the root directory is not allowed because e2fsck expects + * lost+found to exist and be unencrypted, and encrypting the root + * directory would imply encrypting the lost+found directory as well as + * the filename "lost+found" itself. + */ + if (inode->i_ino == EXT4_ROOT_INO) + return -EPERM; + + if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) + return -EINVAL; + + if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) + return -EOPNOTSUPP; + + res = ext4_convert_inline_data(inode); + if (res) + return res; + + /* + * If a journal handle was specified, then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ + + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(inode, + EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + } + return res; + } + + res = dquot_initialize(inode); + if (res) + return res; +retry: + res = ext4_xattr_set_credits(inode, len, false /* is_create */, + &credits); + if (res) + return res; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); + if (!res) { + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* + * Update inode->i_flags - S_ENCRYPTED will be enabled, + * S_DAX may be disabled + */ + ext4_set_inode_flags(inode, false); + res = ext4_mark_inode_dirty(handle, inode); + if (res) + EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); + } + res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; + if (!res) + res = res2; + return res; +} + +static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) +{ + return EXT4_SB(sb)->s_dummy_enc_policy.policy; +} + +static bool ext4_has_stable_inodes(struct super_block *sb) +{ + return ext4_has_feature_stable_inodes(sb); +} + +static void ext4_get_ino_and_lblk_bits(struct super_block *sb, + int *ino_bits_ret, int *lblk_bits_ret) +{ + *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); + *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); +} + +const struct fscrypt_operations ext4_cryptops = { + .key_prefix = "ext4:", + .get_context = ext4_get_context, + .set_context = ext4_set_context, + .get_dummy_policy = ext4_get_dummy_policy, + .empty_dir = ext4_empty_dir, + .has_stable_inodes = ext4_has_stable_inodes, + .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, +}; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 797bc572d6fb..6928e281c2cb 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2733,7 +2733,10 @@ extern int ext4_fname_setup_ci_filename(struct inode *dir, struct ext4_filename *fname); #endif +/* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION +extern const struct fscrypt_operations ext4_cryptops; + static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, const struct fscrypt_name *src) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ea8255a03305..7f6cd2473163 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1495,128 +1495,6 @@ static int ext4_nfs_commit_metadata(struct inode *inode) return ext4_write_inode(inode, &wbc); } -#ifdef CONFIG_FS_ENCRYPTION -static int ext4_get_context(struct inode *inode, void *ctx, size_t len) -{ - return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, len); -} - -static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, - void *fs_data) -{ - handle_t *handle = fs_data; - int res, res2, credits, retries = 0; - - /* - * Encrypting the root directory is not allowed because e2fsck expects - * lost+found to exist and be unencrypted, and encrypting the root - * directory would imply encrypting the lost+found directory as well as - * the filename "lost+found" itself. - */ - if (inode->i_ino == EXT4_ROOT_INO) - return -EPERM; - - if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode))) - return -EINVAL; - - if (ext4_test_inode_flag(inode, EXT4_INODE_DAX)) - return -EOPNOTSUPP; - - res = ext4_convert_inline_data(inode); - if (res) - return res; - - /* - * If a journal handle was specified, then the encryption context is - * being set on a new inode via inheritance and is part of a larger - * transaction to create the inode. Otherwise the encryption context is - * being set on an existing inode in its own transaction. Only in the - * latter case should the "retry on ENOSPC" logic be used. - */ - - if (handle) { - res = ext4_xattr_set_handle(handle, inode, - EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - /* - * Update inode->i_flags - S_ENCRYPTED will be enabled, - * S_DAX may be disabled - */ - ext4_set_inode_flags(inode, false); - } - return res; - } - - res = dquot_initialize(inode); - if (res) - return res; -retry: - res = ext4_xattr_set_credits(inode, len, false /* is_create */, - &credits); - if (res) - return res; - - handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); - - res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - ctx, len, 0); - if (!res) { - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - /* - * Update inode->i_flags - S_ENCRYPTED will be enabled, - * S_DAX may be disabled - */ - ext4_set_inode_flags(inode, false); - res = ext4_mark_inode_dirty(handle, inode); - if (res) - EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); - } - res2 = ext4_journal_stop(handle); - - if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - if (!res) - res = res2; - return res; -} - -static const union fscrypt_policy *ext4_get_dummy_policy(struct super_block *sb) -{ - return EXT4_SB(sb)->s_dummy_enc_policy.policy; -} - -static bool ext4_has_stable_inodes(struct super_block *sb) -{ - return ext4_has_feature_stable_inodes(sb); -} - -static void ext4_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); - *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); -} - -static const struct fscrypt_operations ext4_cryptops = { - .key_prefix = "ext4:", - .get_context = ext4_get_context, - .set_context = ext4_set_context, - .get_dummy_policy = ext4_get_dummy_policy, - .empty_dir = ext4_empty_dir, - .has_stable_inodes = ext4_has_stable_inodes, - .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, -}; -#endif - #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -- cgit v1.2.3 From 3030b59c8533835f4e8b5e6d7047f80dcf7e40b9 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 15 May 2022 12:07:47 +0530 Subject: ext4: cleanup function defs from ext4.h into crypto.c Some of these functions when CONFIG_FS_ENCRYPTION is enabled are not really inline (let compiler be the best judge of it). Remove inline and move them into crypto.c where they should be present. Reviewed-by: Eric Biggers Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/b7b9de2c7226298663fb5a0c28909135e2ab220f.1652595565.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/crypto.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 69 ++++---------------------------------------------------- 2 files changed, 70 insertions(+), 64 deletions(-) (limited to 'fs') diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index e5413c0970ee..f8333927f0f6 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -6,6 +6,71 @@ #include "xattr.h" #include "ext4_jbd2.h" +static void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, + const struct fscrypt_name *src) +{ + memset(dst, 0, sizeof(*dst)); + + dst->usr_fname = src->usr_fname; + dst->disk_name = src->disk_name; + dst->hinfo.hash = src->hash; + dst->hinfo.minor_hash = src->minor_hash; + dst->crypto_buf = src->crypto_buf; +} + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_setup_filename(dir, iname, lookup, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, iname, fname); +#endif + return err; +} + +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname) +{ + struct fscrypt_name name; + int err; + + err = fscrypt_prepare_lookup(dir, dentry, &name); + if (err) + return err; + + ext4_fname_from_fscrypt_name(fname, &name); + +#if IS_ENABLED(CONFIG_UNICODE) + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); +#endif + return err; +} + +void ext4_fname_free_filename(struct ext4_filename *fname) +{ + struct fscrypt_name name; + + name.crypto_buf = fname->crypto_buf; + fscrypt_free_filename(&name); + + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; + +#if IS_ENABLED(CONFIG_UNICODE) + kfree(fname->cf_name.name); + fname->cf_name.name = NULL; +#endif +} + static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6928e281c2cb..d815d07a8c3d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2737,73 +2737,14 @@ extern int ext4_fname_setup_ci_filename(struct inode *dir, #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; -static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst, - const struct fscrypt_name *src) -{ - memset(dst, 0, sizeof(*dst)); - - dst->usr_fname = src->usr_fname; - dst->disk_name = src->disk_name; - dst->hinfo.hash = src->hash; - dst->hinfo.minor_hash = src->minor_hash; - dst->crypto_buf = src->crypto_buf; -} - -static inline int ext4_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, - struct ext4_filename *fname) -{ - struct fscrypt_name name; - int err; +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); - err = fscrypt_setup_filename(dir, iname, lookup, &name); - if (err) - return err; +int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, + struct ext4_filename *fname); - ext4_fname_from_fscrypt_name(fname, &name); +void ext4_fname_free_filename(struct ext4_filename *fname); -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, iname, fname); -#endif - return err; -} - -static inline int ext4_fname_prepare_lookup(struct inode *dir, - struct dentry *dentry, - struct ext4_filename *fname) -{ - struct fscrypt_name name; - int err; - - err = fscrypt_prepare_lookup(dir, dentry, &name); - if (err) - return err; - - ext4_fname_from_fscrypt_name(fname, &name); - -#if IS_ENABLED(CONFIG_UNICODE) - err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); -#endif - return err; -} - -static inline void ext4_fname_free_filename(struct ext4_filename *fname) -{ - struct fscrypt_name name; - - name.crypto_buf = fname->crypto_buf; - fscrypt_free_filename(&name); - - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; - -#if IS_ENABLED(CONFIG_UNICODE) - kfree(fname->cf_name.name); - fname->cf_name.name = NULL; -#endif -} #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, -- cgit v1.2.3 From 72f63f4a770310233dba3bff8fc6e41660ebaa27 Mon Sep 17 00:00:00 2001 From: Ritesh Harjani Date: Sun, 15 May 2022 12:07:48 +0530 Subject: ext4: refactor and move ext4_ioctl_get_encryption_pwsalt() This patch move code for FS_IOC_GET_ENCRYPTION_PWSALT case into ext4's crypto.c file, i.e. ext4_ioctl_get_encryption_pwsalt() and uuid_is_zero(). This is mostly refactoring logic and should not affect any functionality change. Suggested-by: Eric Biggers Reviewed-by: Eric Biggers Signed-off-by: Ritesh Harjani Link: https://lore.kernel.org/r/5af98b17152a96b245b4f7d2dfb8607fc93e36aa.1652595565.git.ritesh.list@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/crypto.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 8 ++++++++ fs/ext4/ioctl.c | 59 ++------------------------------------------------------ 3 files changed, 64 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index f8333927f0f6..e20ac0654b3f 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "ext4.h" #include "xattr.h" @@ -71,6 +72,59 @@ void ext4_fname_free_filename(struct ext4_filename *fname) #endif } +static bool uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return false; + return true; +} + +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) +{ + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + int err, err2; + handle_t *handle; + + if (!ext4_has_feature_encrypt(sb)) + return -EOPNOTSUPP; + + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto pwsalt_err_journal; + lock_buffer(sbi->s_sbh); + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); +pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; +pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + + if (copy_to_user(arg, sbi->s_es->s_encrypt_pw_salt, 16)) + return -EFAULT; + return 0; +} + static int ext4_get_context(struct inode *inode, void *ctx, size_t len) { return ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d815d07a8c3d..a16d77b29277 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2745,6 +2745,8 @@ int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, void ext4_fname_free_filename(struct ext4_filename *fname); +int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); + #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, @@ -2777,6 +2779,12 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) fname->cf_name.name = NULL; #endif } + +static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, + void __user *arg) +{ + return -EOPNOTSUPP; +} #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index ba44fa1be70a..d8639aaed3f6 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -504,18 +503,6 @@ journal_err_out: return err; } -#ifdef CONFIG_FS_ENCRYPTION -static int uuid_is_zero(__u8 u[16]) -{ - int i; - - for (i = 0; i < 16; i++) - if (u[i]) - return 0; - return 1; -} -#endif - /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. Don't error out if we're only trying to set @@ -1432,51 +1419,9 @@ resizefs_out: return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - case FS_IOC_GET_ENCRYPTION_PWSALT: { -#ifdef CONFIG_FS_ENCRYPTION - int err, err2; - struct ext4_sb_info *sbi = EXT4_SB(sb); - handle_t *handle; + case FS_IOC_GET_ENCRYPTION_PWSALT: + return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); - if (!ext4_has_feature_encrypt(sb)) - return -EOPNOTSUPP; - if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { - err = mnt_want_write_file(filp); - if (err) - return err; - handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto pwsalt_err_exit; - } - err = ext4_journal_get_write_access(handle, sb, - sbi->s_sbh, - EXT4_JTR_NONE); - if (err) - goto pwsalt_err_journal; - lock_buffer(sbi->s_sbh); - generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); - err = ext4_handle_dirty_metadata(handle, NULL, - sbi->s_sbh); - pwsalt_err_journal: - err2 = ext4_journal_stop(handle); - if (err2 && !err) - err = err2; - pwsalt_err_exit: - mnt_drop_write_file(filp); - if (err) - return err; - } - if (copy_to_user((void __user *) arg, - sbi->s_es->s_encrypt_pw_salt, 16)) - return -EFAULT; - return 0; -#else - return -EOPNOTSUPP; -#endif - } case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; -- cgit v1.2.3 From ef09ed5d37b84d18562b30cf7253e57062d0db05 Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 16 May 2022 20:26:34 +0800 Subject: ext4: fix bug_on in ext4_writepages we got issue as follows: EXT4-fs error (device loop0): ext4_mb_generate_buddy:1141: group 0, block bitmap and bg descriptor inconsistent: 25 vs 31513 free cls ------------[ cut here ]------------ kernel BUG at fs/ext4/inode.c:2708! invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 2 PID: 2147 Comm: rep Not tainted 5.18.0-rc2-next-20220413+ #155 RIP: 0010:ext4_writepages+0x1977/0x1c10 RSP: 0018:ffff88811d3e7880 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88811c098000 RDX: 0000000000000000 RSI: ffff88811c098000 RDI: 0000000000000002 RBP: ffff888128140f50 R08: ffffffffb1ff6387 R09: 0000000000000000 R10: 0000000000000007 R11: ffffed10250281ea R12: 0000000000000001 R13: 00000000000000a4 R14: ffff88811d3e7bb8 R15: ffff888128141028 FS: 00007f443aed9740(0000) GS:ffff8883aef00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020007200 CR3: 000000011c2a4000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: do_writepages+0x130/0x3a0 filemap_fdatawrite_wbc+0x83/0xa0 filemap_flush+0xab/0xe0 ext4_alloc_da_blocks+0x51/0x120 __ext4_ioctl+0x1534/0x3210 __x64_sys_ioctl+0x12c/0x170 do_syscall_64+0x3b/0x90 It may happen as follows: 1. write inline_data inode vfs_write new_sync_write ext4_file_write_iter ext4_buffered_write_iter generic_perform_write ext4_da_write_begin ext4_da_write_inline_data_begin -> If inline data size too small will allocate block to write, then mapping will has dirty page ext4_da_convert_inline_data_to_extent ->clear EXT4_STATE_MAY_INLINE_DATA 2. fallocate do_vfs_ioctl ioctl_preallocate vfs_fallocate ext4_fallocate ext4_convert_inline_data ext4_convert_inline_data_nolock ext4_map_blocks -> fail will goto restore data ext4_restore_inline_data ext4_create_inline_data ext4_write_inline_data ext4_set_inode_state -> set inode EXT4_STATE_MAY_INLINE_DATA 3. writepages __ext4_ioctl ext4_alloc_da_blocks filemap_flush filemap_fdatawrite_wbc do_writepages ext4_writepages if (ext4_has_inline_data(inode)) BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) The root cause of this issue is we destory inline data until call ext4_writepages under delay allocation mode. But there maybe already convert from inline to extent. To solve this issue, we call filemap_flush first.. Cc: stable@kernel.org Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220516122634.1690462-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 13a63e3639b1..513762c087a9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -2005,6 +2005,18 @@ int ext4_convert_inline_data(struct inode *inode) if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; + } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + /* + * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is + * cleared. This means we are in the middle of moving of + * inline data to delay allocated block. Just force writeout + * here to finish conversion. + */ + error = filemap_flush(inode->i_mapping); + if (error) + return error; + if (!ext4_has_inline_data(inode)) + return 0; } needed_blocks = ext4_writepage_trans_blocks(inode); -- cgit v1.2.3 From c878bea3c9d724ddfa05a813f30de3d25a0ba83f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 17 May 2022 13:27:55 -0400 Subject: ext4: filter out EXT4_FC_REPLAY from on-disk superblock field s_state The EXT4_FC_REPLAY bit in sbi->s_mount_state is used to indicate that we are in the middle of replay the fast commit journal. This was actually a mistake, since the sbi->s_mount_info is initialized from es->s_state. Arguably s_mount_state is misleadingly named, but the name is historical --- s_mount_state and s_state dates back to ext2. What should have been used is the ext4_{set,clear,test}_mount_flag() inline functions, which sets EXT4_MF_* bits in sbi->s_mount_flags. The problem with using EXT4_FC_REPLAY is that a maliciously corrupted superblock could result in EXT4_FC_REPLAY getting set in s_mount_state. This bypasses some sanity checks, and this can trigger a BUG() in ext4_es_cache_extent(). As a easy-to-backport-fix, filter out the EXT4_FC_REPLAY bit for now. We should eventually transition away from EXT4_FC_REPLAY to something like EXT4_MF_REPLAY. Cc: stable@kernel.org Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20220420192312.1655305-1-phind.uet@gmail.com Link: https://lore.kernel.org/r/20220517174028.942119-1-tytso@mit.edu Reported-by: syzbot+c7358a3cd05ee786eb31@syzkaller.appspotmail.com --- fs/ext4/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f6cd2473163..9cbb22045379 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4770,7 +4770,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); sbi->s_sbh = bh; - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); @@ -6333,7 +6333,8 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) if (err) goto restore_opts; } - sbi->s_mount_state = le16_to_cpu(es->s_state); + sbi->s_mount_state = (le16_to_cpu(es->s_state) & + ~EXT4_FC_REPLAY); err = ext4_setup_super(sb, es, 0); if (err) -- cgit v1.2.3 From 46c116b920ebec58031f0a78c5ea9599b0d2a371 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 18 May 2022 11:33:28 +0200 Subject: ext4: verify dir block before splitting it Before splitting a directory block verify its directory entries are sane so that the splitting code does not access memory it should not. Cc: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220518093332.13986-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7aca8944901d..7286472e9558 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -277,9 +277,9 @@ static struct dx_frame *dx_probe(struct ext4_filename *fname, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, - unsigned blocksize, struct dx_hash_info *hinfo, - struct dx_map_entry map[]); +static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, + struct dx_map_entry *map_tail); static void dx_sort_map(struct dx_map_entry *map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, char *to, struct dx_map_entry *offsets, @@ -1249,15 +1249,23 @@ static inline int search_dirblock(struct buffer_head *bh, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, - unsigned blocksize, struct dx_hash_info *hinfo, +static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; - char *base = (char *) de; + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data; + unsigned int buflen = bh->b_size; + char *base = bh->b_data; struct dx_hash_info h = *hinfo; - while ((char *) de < base + blocksize) { + if (ext4_has_metadata_csum(dir->i_sb)) + buflen -= sizeof(struct ext4_dir_entry_tail); + + while ((char *) de < base + buflen) { + if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen, + ((char *)de) - base)) + return -EFSCORRUPTED; if (de->name_len && de->inode) { if (ext4_hash_in_dirent(dir)) h.hash = EXT4_DIRENT_HASH(de); @@ -1270,8 +1278,7 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, count++; cond_resched(); } - /* XXX: do we need to check rec_len == 0 case? -Chris */ - de = ext4_next_entry(de, blocksize); + de = ext4_next_entry(de, dir->i_sb->s_blocksize); } return count; } @@ -1943,8 +1950,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, - blocksize, hinfo, map); + count = dx_make_map(dir, *bh, hinfo, map); + if (count < 0) { + err = count; + goto journal_error; + } map -= count; dx_sort_map(map, count); /* Ensure that neither split block is over half full */ -- cgit v1.2.3 From 3ba733f879c2a88910744647e41edeefbc0d92b2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 18 May 2022 11:33:29 +0200 Subject: ext4: avoid cycles in directory h-tree A maliciously corrupted filesystem can contain cycles in the h-tree stored inside a directory. That can easily lead to the kernel corrupting tree nodes that were already verified under its hands while doing a node split and consequently accessing unallocated memory. Fix the problem by verifying traversed block numbers are unique. Cc: stable@vger.kernel.org Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20220518093332.13986-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7286472e9558..47d0ca4c795b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -777,12 +777,14 @@ static struct dx_frame * dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { - unsigned count, indirect; + unsigned count, indirect, level, i; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; struct dx_frame *frame = frame_in; struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; + ext4_lblk_t block; + ext4_lblk_t blocks[EXT4_HTREE_LEVEL]; memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0])); frame->bh = ext4_read_dirblock(dir, 0, INDEX); @@ -854,6 +856,8 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, } dxtrace(printk("Look up %x", hash)); + level = 0; + blocks[0] = 0; while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { @@ -882,15 +886,27 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, dx_get_block(at))); frame->entries = entries; frame->at = at; - if (!indirect--) + + block = dx_get_block(at); + for (i = 0; i <= level; i++) { + if (blocks[i] == block) { + ext4_warning_inode(dir, + "dx entry: tree cycle block %u points back to block %u", + blocks[level], block); + goto fail; + } + } + if (++level > indirect) return frame; + blocks[level] = block; frame++; - frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + frame->bh = ext4_read_dirblock(dir, block, INDEX); if (IS_ERR(frame->bh)) { ret_err = (struct dx_frame *) frame->bh; frame->bh = NULL; goto fail; } + entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit(dir)) { -- cgit v1.2.3 From d36f6ed761b53933b0b4126486c10d3da7751e7f Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 18 May 2022 20:08:16 +0800 Subject: ext4: fix bug_on in __es_tree_search Hulk Robot reported a BUG_ON: ================================================================== kernel BUG at fs/ext4/extents_status.c:199! [...] RIP: 0010:ext4_es_end fs/ext4/extents_status.c:199 [inline] RIP: 0010:__es_tree_search+0x1e0/0x260 fs/ext4/extents_status.c:217 [...] Call Trace: ext4_es_cache_extent+0x109/0x340 fs/ext4/extents_status.c:766 ext4_cache_extents+0x239/0x2e0 fs/ext4/extents.c:561 ext4_find_extent+0x6b7/0xa20 fs/ext4/extents.c:964 ext4_ext_map_blocks+0x16b/0x4b70 fs/ext4/extents.c:4384 ext4_map_blocks+0xe26/0x19f0 fs/ext4/inode.c:567 ext4_getblk+0x320/0x4c0 fs/ext4/inode.c:980 ext4_bread+0x2d/0x170 fs/ext4/inode.c:1031 ext4_quota_read+0x248/0x320 fs/ext4/super.c:6257 v2_read_header+0x78/0x110 fs/quota/quota_v2.c:63 v2_check_quota_file+0x76/0x230 fs/quota/quota_v2.c:82 vfs_load_quota_inode+0x5d1/0x1530 fs/quota/dquot.c:2368 dquot_enable+0x28a/0x330 fs/quota/dquot.c:2490 ext4_quota_enable fs/ext4/super.c:6137 [inline] ext4_enable_quotas+0x5d7/0x960 fs/ext4/super.c:6163 ext4_fill_super+0xa7c9/0xdc00 fs/ext4/super.c:4754 mount_bdev+0x2e9/0x3b0 fs/super.c:1158 mount_fs+0x4b/0x1e4 fs/super.c:1261 [...] ================================================================== Above issue may happen as follows: ------------------------------------- ext4_fill_super ext4_enable_quotas ext4_quota_enable ext4_iget __ext4_iget ext4_ext_check_inode ext4_ext_check __ext4_ext_check ext4_valid_extent_entries Check for overlapping extents does't take effect dquot_enable vfs_load_quota_inode v2_check_quota_file v2_read_header ext4_quota_read ext4_bread ext4_getblk ext4_map_blocks ext4_ext_map_blocks ext4_find_extent ext4_cache_extents ext4_es_cache_extent ext4_es_cache_extent __es_tree_search ext4_es_end BUG_ON(es->es_lblk + es->es_len < es->es_lblk) The error ext4 extents is as follows: 0af3 0300 0400 0000 00000000 extent_header 00000000 0100 0000 12000000 extent1 00000000 0100 0000 18000000 extent2 02000000 0400 0000 14000000 extent3 In the ext4_valid_extent_entries function, if prev is 0, no error is returned even if lblock<=prev. This was intended to skip the check on the first extent, but in the error image above, prev=0+1-1=0 when checking the second extent, so even though lblock<=prev, the function does not return an error. As a result, bug_ON occurs in __es_tree_search and the system panics. To solve this problem, we only need to check that: 1. The lblock of the first extent is not less than 0. 2. The lblock of the next extent is not less than the next block of the previous extent. The same applies to extent_idx. Cc: stable@kernel.org Fixes: 5946d089379a ("ext4: check for overlapping extents in ext4_valid_extent_entries()") Reported-by: Hulk Robot Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20220518120816.1541863-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 474479ce76e0..c148bb97b527 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -372,7 +372,7 @@ static int ext4_valid_extent_entries(struct inode *inode, { unsigned short entries; ext4_lblk_t lblock = 0; - ext4_lblk_t prev = 0; + ext4_lblk_t cur = 0; if (eh->eh_entries == 0) return 1; @@ -396,11 +396,11 @@ static int ext4_valid_extent_entries(struct inode *inode, /* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } - prev = lblock + ext4_ext_get_actual_len(ext) - 1; + cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } @@ -420,13 +420,13 @@ static int ext4_valid_extent_entries(struct inode *inode, /* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; - prev = lblock; + cur = lblock + 1; } } return 1; -- cgit v1.2.3 From 5f41fdaea63ddf96d921ab36b2af4a90ccdb5744 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 19 May 2022 13:44:37 -0700 Subject: ext4: only allow test_dummy_encryption when supported Make the test_dummy_encryption mount option require that the encrypt feature flag be already enabled on the filesystem, rather than automatically enabling it. Practically, this means that "-O encrypt" will need to be included in MKFS_OPTIONS when running xfstests with the test_dummy_encryption mount option. (ext4/053 also needs an update.) Moreover, as long as the preconditions for test_dummy_encryption are being tightened anyway, take the opportunity to start rejecting it when !CONFIG_FS_ENCRYPTION rather than ignoring it. The motivation for requiring the encrypt feature flag is that: - Having the filesystem auto-enable feature flags is problematic, as it bypasses the usual sanity checks. The specific issue which came up recently is that in kernel versions where ext4 supports casefold but not encrypt+casefold (v5.1 through v5.10), the kernel will happily add the encrypt flag to a filesystem that has the casefold flag, making it unmountable -- but only for subsequent mounts, not the initial one. This confused the casefold support detection in xfstests, causing generic/556 to fail rather than be skipped. - The xfstests-bld test runners (kvm-xfstests et al.) already use the required mkfs flag, so they will not be affected by this change. Only users of test_dummy_encryption alone will be affected. But, this option has always been for testing only, so it should be fine to require that the few users of this option update their test scripts. - f2fs already requires it (for its equivalent feature flag). Signed-off-by: Eric Biggers Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20220519204437.61645-1-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 6 ------ fs/ext4/super.c | 60 ++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 38 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a16d77b29277..d5cea9c2e2a2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1442,12 +1442,6 @@ struct ext4_super_block { #ifdef __KERNEL__ -#ifdef CONFIG_FS_ENCRYPTION -#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) -#else -#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) -#endif - /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9cbb22045379..8ff4c6545a49 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2308,11 +2308,12 @@ static int ext4_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx->spec |= EXT4_SPEC_DUMMY_ENCRYPTION; ctx->test_dummy_enc_arg = kmemdup_nul(param->string, param->size, GFP_KERNEL); + return 0; #else ext4_msg(NULL, KERN_WARNING, - "Test dummy encryption mount option ignored"); + "test_dummy_encryption option not supported"); + return -EINVAL; #endif - return 0; case Opt_dax: case Opt_dax_type: #ifdef CONFIG_FS_DAX @@ -2669,12 +2670,44 @@ err_jquota_specified: #endif } +static int ext4_check_test_dummy_encryption(const struct fs_context *fc, + struct super_block *sb) +{ +#ifdef CONFIG_FS_ENCRYPTION + const struct ext4_fs_context *ctx = fc->fs_private; + const struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!(ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION)) + return 0; + + if (!ext4_has_feature_encrypt(sb)) { + ext4_msg(NULL, KERN_WARNING, + "test_dummy_encryption requires encrypt feature"); + return -EINVAL; + } + /* + * This mount option is just for testing, and it's not worthwhile to + * implement the extra complexity (e.g. RCU protection) that would be + * needed to allow it to be set or changed during remount. We do allow + * it to be specified during remount, but only if there is no change. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && + !sbi->s_dummy_enc_policy.policy) { + ext4_msg(NULL, KERN_WARNING, + "Can't set test_dummy_encryption on remount"); + return -EINVAL; + } +#endif /* CONFIG_FS_ENCRYPTION */ + return 0; +} + static int ext4_check_opt_consistency(struct fs_context *fc, struct super_block *sb) { struct ext4_fs_context *ctx = fc->fs_private; struct ext4_sb_info *sbi = fc->s_fs_info; int is_remount = fc->purpose == FS_CONTEXT_FOR_RECONFIGURE; + int err; if ((ctx->opt_flags & MOPT_NO_EXT2) && IS_EXT2_SB(sb)) { ext4_msg(NULL, KERN_ERR, @@ -2704,20 +2737,9 @@ static int ext4_check_opt_consistency(struct fs_context *fc, "for blocksize < PAGE_SIZE"); } -#ifdef CONFIG_FS_ENCRYPTION - /* - * This mount option is just for testing, and it's not worthwhile to - * implement the extra complexity (e.g. RCU protection) that would be - * needed to allow it to be set or changed during remount. We do allow - * it to be specified during remount, but only if there is no change. - */ - if ((ctx->spec & EXT4_SPEC_DUMMY_ENCRYPTION) && - is_remount && !sbi->s_dummy_enc_policy.policy) { - ext4_msg(NULL, KERN_WARNING, - "Can't set test_dummy_encryption on remount"); - return -1; - } -#endif + err = ext4_check_test_dummy_encryption(fc, sb); + if (err) + return err; if ((ctx->spec & EXT4_SPEC_DATAJ) && is_remount) { if (!sbi->s_journal) { @@ -5163,12 +5185,6 @@ no_journal: goto failed_mount_wq; } - if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && - !ext4_has_feature_encrypt(sb)) { - ext4_set_feature_encrypt(sb); - ext4_commit_super(sb); - } - /* * Get the # of file system overhead blocks from the * superblock if present. -- cgit v1.2.3