From ee6a12d0d4d85f3833d177cd382cd417f0ef011b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 5 Oct 2023 21:42:47 -0400 Subject: ext4: add missing initialization of call_notify_error in update_super_work() Fixes: ff0722de896e ("ext4: add periodic superblock update check") Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dbebd8b3127e..6f48dec19f4a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -768,7 +768,8 @@ static void update_super_work(struct work_struct *work) */ if (!sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; - bool call_notify_err; + bool call_notify_err = false; + handle = jbd2_journal_start(journal, 1); if (IS_ERR(handle)) goto write_directly; -- cgit v1.2.3 From 745f17a4166e79315e4b7f33ce89d03e75a76983 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 24 May 2023 15:25:38 +0800 Subject: ext4: fix race between writepages and remount We got a WARNING in ext4_add_complete_io: ================================================================== WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250 CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85 RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4] [...] Call Trace: ext4_end_bio+0xa8/0x240 [ext4] bio_endio+0x195/0x310 blk_update_request+0x184/0x770 scsi_end_request+0x2f/0x240 scsi_io_completion+0x75/0x450 scsi_finish_command+0xef/0x160 scsi_complete+0xa3/0x180 blk_complete_reqs+0x60/0x80 blk_done_softirq+0x25/0x40 __do_softirq+0x119/0x4c8 run_ksoftirqd+0x42/0x70 smpboot_thread_fn+0x136/0x3c0 kthread+0x140/0x1a0 ret_from_fork+0x2c/0x50 ================================================================== Above issue may happen as follows: cpu1 cpu2 ----------------------------|---------------------------- mount -o dioread_lock ext4_writepages ext4_do_writepages *if (ext4_should_dioread_nolock(inode))* // rsv_blocks is not assigned here mount -o remount,dioread_nolock ext4_journal_start_with_reserve __ext4_journal_start __ext4_journal_start_sb jbd2__journal_start *if (rsv_blocks)* // h_rsv_handle is not initialized here mpage_map_and_submit_extent mpage_map_one_extent dioread_nolock = ext4_should_dioread_nolock(inode) if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) mpd->io_submit.io_end->handle = handle->h_rsv_handle ext4_set_io_unwritten_flag io_end->flag |= EXT4_IO_END_UNWRITTEN // now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag scsi_finish_command scsi_io_completion scsi_io_completion_action scsi_end_request blk_update_request req_bio_endio bio_endio bio->bi_end_io > ext4_end_bio ext4_put_io_end_defer ext4_add_complete_io // trigger WARN_ON(!io_end->handle && sbi->s_journal); The immediate cause of this problem is that ext4_should_dioread_nolock() function returns inconsistent values in the ext4_do_writepages() and mpage_map_one_extent(). There are four conditions in this function that can be changed at mount time to cause this problem. These four conditions can be divided into two categories: (1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl (2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount The two in the first category have been fixed by commit c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling EXT4_EXTENTS_FL") respectively. 
Two cases in the other category have not yet been fixed, and the above issue is caused by this situation. We refer to the fix for the first category, when applying options during remount, we grab s_writepages_rwsem to avoid racing with writepages ops to trigger this problem. Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io") Cc: stable@vger.kernel.org Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230524072538.2883391-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/super.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9418359b1d9d..cd4ccae1e28a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1676,7 +1676,8 @@ struct ext4_sb_info { /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA - * or EXTENTS flag. + * or EXTENTS flag or between writepages ops and changing DELALLOC or + * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6f48dec19f4a..d062383ea50e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6443,6 +6443,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) struct ext4_mount_options old_opts; ext4_group_t g; int err = 0; + int alloc_ctx; #ifdef CONFIG_QUOTA int enable_quota = 0; int i, j; @@ -6483,7 +6484,16 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) } + /* + * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause + * two calls to ext4_should_dioread_nolock() to return inconsistent + * values, triggering WARN_ON in ext4_add_complete_io(). we grab + * here s_writepages_rwsem to avoid race between writepages ops and + * remount. 
+ */ + alloc_ctx = ext4_writepages_down_write(sb); ext4_apply_options(fc, sb); + ext4_writepages_up_write(sb, alloc_ctx); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -6701,6 +6711,8 @@ restore_opts: if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && sb_any_quota_suspended(sb)) dquot_resume(sb, -1); + + alloc_ctx = ext4_writepages_down_write(sb); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -6709,6 +6721,8 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + ext4_writepages_up_write(sb, alloc_ctx); + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA -- cgit v1.2.3 From a8c1eb77edfc92a64788dad70fedf9277cbafe76 Mon Sep 17 00:00:00 2001 From: Lu Hongfei Date: Mon, 29 May 2023 15:09:30 +0800 Subject: ext4: fix traditional comparison using max/min method It would be better to replace the traditional ternary conditional operator with max()/min() Signed-off-by: Lu Hongfei Reviewed-by: Kemeng Shi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230529070930.37949-1-luhongfei@vivo.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 79b20d6ae39e..4d08bb2bd184 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -111,10 +111,8 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, itbl_blk_start = ext4_inode_table(sb, gdp); itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; if (itbl_blk_start <= end && itbl_blk_end >= start) { - itbl_blk_start = itbl_blk_start >= start ? - itbl_blk_start : start; - itbl_blk_end = itbl_blk_end <= end ? - itbl_blk_end : end; + itbl_blk_start = max(itbl_blk_start, start); + itbl_blk_end = min(itbl_blk_end, end); itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); -- cgit v1.2.3 From ce774e5365e46be73ed055302c6de123a03394ea Mon Sep 17 00:00:00 2001 From: Jinke Han Date: Mon, 12 Jun 2023 20:40:17 +0800 Subject: ext4: make running and commit transaction have their own freed_data_list When releasing space in jbd, we traverse s_freed_data_list to get the free range belonging to the current commit transaction. In extreme cases, the time spent may not be small, and we have observed cases exceeding 10ms. This patch makes running and commit transactions manage their own free_data_list respectively, eliminating unnecessary traversal. And in the callback phase of the commit transaction, no one will touch it except the jbd thread itself, so s_md_lock is no longer needed. 
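As an illustration of the parity scheme, here is a minimal userspace sketch (simplified names, not the kernel code): because consecutive transactions have consecutive tids, the committing transaction and the running transaction always land on different lists, and the commit path can drain its whole list without scanning entries.

#include <stdio.h>

/* toy stand-in for struct ext4_free_data */
struct entry {
	unsigned long tid;	/* transaction that freed the range */
	struct entry *next;
};

/* two lists indexed by tid parity, as in s_freed_data_list[2] */
static struct entry *freed_list[2];

static void record_free(struct entry *e)
{
	/* the running transaction appends to its own list */
	e->next = freed_list[e->tid & 1];
	freed_list[e->tid & 1] = e;
}

static void process_commit(unsigned long commit_tid)
{
	/* the commit drains only its own list, like list_replace_init() */
	struct entry *head = freed_list[commit_tid & 1];

	freed_list[commit_tid & 1] = NULL;
	for (; head; head = head->next)
		printf("freeing range recorded under tid %lu\n", head->tid);
}

int main(void)
{
	struct entry a = { .tid = 10 }, b = { .tid = 11 };

	record_free(&a);	/* committing transaction, list 0 */
	record_free(&b);	/* running transaction, list 1 */
	process_commit(10);	/* touches list 0 only, no traversal */
	return 0;
}

With sequential tids, tid & 1 always routes the running and committing transactions to opposite lists, which is what removes both the list walk and the need for s_md_lock in the commit callback.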
Signed-off-by: Jinke Han Reviewed-by: Zhang Yi Link: https://lore.kernel.org/r/20230612124017.14115-1-hanjinke.666@bytedance.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/mballoc.c | 18 +++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cd4ccae1e28a..3b8bc44be528 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1564,7 +1564,7 @@ struct ext4_sb_info { unsigned int *s_mb_maxs; unsigned int s_group_info_size; unsigned int s_mb_free_pending; - struct list_head s_freed_data_list; /* List of blocks to be freed + struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1e599305d85f..1d65c738c4c9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3631,7 +3631,8 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; - INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); @@ -3883,19 +3884,10 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); - struct list_head *cut_pos = NULL; + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; - spin_lock(&sbi->s_md_lock); - list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { - if (entry->efd_tid != commit_tid) - break; - cut_pos = &entry->efd_list; - } - if (cut_pos) - list_cut_position(&freed_data_list, &sbi->s_freed_data_list, - cut_pos); - spin_unlock(&sbi->s_md_lock); + list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); @@ -6378,7 +6370,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, } spin_lock(&sbi->s_md_lock); - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } -- cgit v1.2.3 From 40ea98396a3659062267d1fe5f99af4f7e4f05e3 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 24 Aug 2023 17:26:04 +0800 Subject: ext4: correct the start block of counting reserved clusters When big allocate feature is enabled, we need to count and update reserved clusters before removing a delayed only extent_status entry. {init|count|get}_rsvd() have already done this, but the start block number of this counting isn't correct in the following case. lblk end | | v v ------------------------- | | orig_es ------------------------- ^ ^ len1 is 0 | len2 | If the start block of the orig_es entry founded is bigger than lblk, we passed lblk as start block to count_rsvd(), but the length is correct, finally, the range to be counted is offset. This patch fix this by passing the start blocks to 'orig_es->lblk + len1'. 
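A small worked example (hypothetical block numbers) of the offset described above: with an extent covering blocks 100..199 and a removal range starting below it at block 90, len1 is 0, and the old code counted a range shifted 10 blocks too low.

#include <stdio.h>

int main(void)
{
	/* hypothetical numbers: extent 100..199, remove range 90..149 */
	unsigned int es_lblk = 100, es_len = 100;
	unsigned int lblk = 90, end = 149;

	unsigned int len1 = lblk > es_lblk ? lblk - es_lblk : 0;	/* 0 */
	unsigned int len2 = es_lblk + es_len - 1 > end ?
				es_lblk + es_len - 1 - end : 0;		/* 50 */
	unsigned int count = es_len - len1 - len2;			/* 50 */

	/* old: counted [90..139], shifted 10 blocks off the extent */
	printf("old range: [%u..%u]\n", lblk, lblk + count - 1);
	/* new: counts [100..149], the blocks actually being removed */
	printf("new range: [%u..%u]\n", es_lblk + len1,
	       es_lblk + len1 + count - 1);
	return 0;
}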
Signed-off-by: Zhang Yi Cc: stable@kernel.org Link: https://lore.kernel.org/r/20230824092619.1327976-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/extents_status.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 6f7de14c0fa8..5e625ea4545d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1405,8 +1405,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, } } if (count_reserved) - count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, - &orig_es, &rc); + count_rsvd(inode, orig_es.es_lblk + len1, + orig_es.es_len - len1 - len2, &orig_es, &rc); goto out_get_reserved; } -- cgit v1.2.3 From 8e387c89e96b9543a339f84043cf9df15fed2632 Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Thu, 24 Aug 2023 17:26:05 +0800 Subject: ext4: make sure allocate pending entry not fail __insert_pending() allocate memory in atomic context, so the allocation could fail, but we are not handling that failure now. It could lead ext4_es_remove_extent() to get wrong reserved clusters, and the global data blocks reservation count will be incorrect. The same to extents_status entry preallocation, preallocate pending entry out of the i_es_lock with __GFP_NOFAIL, make sure __insert_pending() and __revise_pending() always succeeds. Signed-off-by: Zhang Yi Cc: stable@kernel.org Link: https://lore.kernel.org/r/20230824092619.1327976-3-yi.zhang@huaweicloud.com Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 123 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 89 insertions(+), 34 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 5e625ea4545d..f4b50652f0cc 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -152,8 +152,9 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei); -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc); int __init ext4_init_es(void) { @@ -448,6 +449,19 @@ static void ext4_es_list_del(struct inode *inode) spin_unlock(&sbi->s_es_lock); } +static inline struct pending_reservation *__alloc_pending(bool nofail) +{ + if (!nofail) + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); + + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); +} + +static inline void __free_pending(struct pending_reservation *pr) +{ + kmem_cache_free(ext4_pending_cachep, pr); +} + /* * Returns true if we cannot fail to allocate memory for this extent_status * entry and cannot reclaim it until its status changes. 
@@ -836,11 +850,12 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, { struct extent_status newes; ext4_lblk_t end = lblk + len - 1; - int err1 = 0; - int err2 = 0; + int err1 = 0, err2 = 0, err3 = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; + bool revise_pending = false; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -868,11 +883,17 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_insert_extent_check(inode, &newes); + revise_pending = sbi->s_cluster_ratio > 1 && + test_opt(inode->i_sb, DELALLOC) && + (status & (EXTENT_STATUS_WRITTEN | + EXTENT_STATUS_UNWRITTEN)); retry: if (err1 && !es1) es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3) && revise_pending && !pr) + pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, end, NULL, es1); @@ -897,13 +918,18 @@ retry: es2 = NULL; } - if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && - (status & EXTENT_STATUS_WRITTEN || - status & EXTENT_STATUS_UNWRITTEN)) - __revise_pending(inode, lblk, len); + if (revise_pending) { + err3 = __revise_pending(inode, lblk, len, &pr); + if (err3 != 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + if (err1 || err2 || err3) goto retry; ext4_es_print_tree(inode); @@ -1311,7 +1337,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end, rc->ndelonly--; node = rb_next(&pr->rb_node); rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); if (!node) break; pr = rb_entry(node, struct pending_reservation, @@ -1907,11 +1933,13 @@ static struct pending_reservation *__get_pending(struct inode *inode, * * @inode - file containing the cluster * @lblk - logical block in the cluster to be added + * @prealloc - preallocated pending entry * * Returns 0 on successful insertion and -ENOMEM on failure. If the * pending reservation is already in the set, returns successfully. 
*/ -static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) +static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; @@ -1937,10 +1965,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) } } - pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); - if (pr == NULL) { - ret = -ENOMEM; - goto out; + if (likely(*prealloc == NULL)) { + pr = __alloc_pending(false); + if (!pr) { + ret = -ENOMEM; + goto out; + } + } else { + pr = *prealloc; + *prealloc = NULL; } pr->lclu = lclu; @@ -1970,7 +2003,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk) if (pr != NULL) { tree = &EXT4_I(inode)->i_pending_tree; rb_erase(&pr->rb_node, &tree->root); - kmem_cache_free(ext4_pending_cachep, pr); + __free_pending(pr); } } @@ -2029,10 +2062,10 @@ void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk, bool allocated) { struct extent_status newes; - int err1 = 0; - int err2 = 0; + int err1 = 0, err2 = 0, err3 = 0; struct extent_status *es1 = NULL; struct extent_status *es2 = NULL; + struct pending_reservation *pr = NULL; if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; @@ -2052,6 +2085,8 @@ retry: es1 = __es_alloc_extent(true); if ((err1 || err2) && !es2) es2 = __es_alloc_extent(true); + if ((err1 || err2 || err3) && allocated && !pr) + pr = __alloc_pending(true); write_lock(&EXT4_I(inode)->i_es_lock); err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); @@ -2074,11 +2109,18 @@ retry: es2 = NULL; } - if (allocated) - __insert_pending(inode, lblk); + if (allocated) { + err3 = __insert_pending(inode, lblk, &pr); + if (err3 != 0) + goto error; + if (pr) { + __free_pending(pr); + pr = NULL; + } + } error: write_unlock(&EXT4_I(inode)->i_es_lock); - if (err1 || err2) + if (err1 || err2 || err3) goto retry; ext4_es_print_tree(inode); @@ -2184,21 +2226,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk, * @inode - file containing the range * @lblk - logical block defining the start of range * @len - length of range in blocks + * @prealloc - preallocated pending entry * * Used after a newly allocated extent is added to the extents status tree. * Requires that the extents in the range have either written or unwritten * status. Must be called while holding i_es_lock. 
*/ -static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) +static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len, + struct pending_reservation **prealloc) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_lblk_t end = lblk + len - 1; ext4_lblk_t first, last; bool f_del = false, l_del = false; + int ret = 0; if (len == 0) - return; + return 0; /* * Two cases - block range within single cluster and block range @@ -2219,7 +2264,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, f_del = __es_scan_range(inode, &ext4_es_is_delonly, first, lblk - 1); if (f_del) { - __insert_pending(inode, first); + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; } else { last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; @@ -2227,9 +2274,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, l_del = __es_scan_range(inode, &ext4_es_is_delonly, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, last); } } else { @@ -2237,18 +2286,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, if (first != lblk) f_del = __es_scan_range(inode, &ext4_es_is_delonly, first, lblk - 1); - if (f_del) - __insert_pending(inode, first); - else + if (f_del) { + ret = __insert_pending(inode, first, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, first); last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; if (last != end) l_del = __es_scan_range(inode, &ext4_es_is_delonly, end + 1, last); - if (l_del) - __insert_pending(inode, last); - else + if (l_del) { + ret = __insert_pending(inode, last, prealloc); + if (ret < 0) + goto out; + } else __remove_pending(inode, last); } +out: + return ret; } -- cgit v1.2.3 From 8fedebb5ea183994aca39af3f80623f5db42fff7 Mon Sep 17 00:00:00 2001 From: Wang Jianjian Date: Thu, 24 Aug 2023 23:23:24 +0800 Subject: ext4: fix incorrect offset The last argument of ext4_check_dir_entry is dentry offset int the file. Luckily this error only results in the wrong offset being printed in the eventual error message. Signed-off-by: Wang Jianjian Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/tencent_F992989953734FD5DE3F88ECB2191A856206@qq.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index bbda587f76b8..cbe756144d8d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2280,8 +2280,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, - (data2 + (blocksize - csum_size) - - (char *) de))) { + (char *)de - data2)) { brelse(bh2); brelse(bh); return -EFSCORRUPTED; -- cgit v1.2.3 From ebf6cb7c6e1241984f75f29f1bdbfa2fe7168f88 Mon Sep 17 00:00:00 2001 From: Wang Jianjian Date: Thu, 24 Aug 2023 23:56:31 +0800 Subject: ext4: no need to generate from free list in mballoc Commit 7a2fcbf7f85 ("ext4: don't use blocks freed but not yet committed in buddy cache init") added a code to mark as used blocks in the list of not yet committed freed blocks during initialization of a buddy page. 
However ext4_mb_free_metadata() makes sure buddy page is already loaded and takes a reference to it so it cannot happen that ext4_mb_init_cache() is called when efd list is non-empty. Just remove the ext4_mb_generate_from_freelist() call. Fixes: 7a2fcbf7f85('ext4: don't use blocks freed but not yet committed in buddy cache init') Signed-off-by: Wang Jianjian Link: https://lore.kernel.org/r/tencent_53CBCB1668358AE862684E453DF37B722008@qq.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/mballoc.c | 39 ++++++--------------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1d65c738c4c9..6e304c18d390 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -417,8 +417,6 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, @@ -1361,17 +1359,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) * We place the buddy block and bitmap block * close together */ + grinfo = ext4_get_group_info(sb, group); + if (!grinfo) { + err = -EFSCORRUPTED; + goto out; + } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); - grinfo = ext4_get_group_info(sb, group); - if (!grinfo) { - err = -EFSCORRUPTED; - goto out; - } grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * @@ -1398,7 +1396,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); - ext4_mb_generate_from_freelist(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -4950,31 +4948,6 @@ try_group_pa: return false; } -/* - * the function goes through all block freed in the group - * but not yet committed and marks them used in in-core bitmap. - * buddy must be generated from this bitmap - * Need to be called with the ext4 group lock held - */ -static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, - ext4_group_t group) -{ - struct rb_node *n; - struct ext4_group_info *grp; - struct ext4_free_data *entry; - - grp = ext4_get_group_info(sb, group); - if (!grp) - return; - n = rb_first(&(grp->bb_free_root)); - - while (n) { - entry = rb_entry(n, struct ext4_free_data, efd_node); - mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); - n = rb_next(n); - } -} - /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. 
buddy must be generated from this bitmap
-- cgit v1.2.3


From 31f13421c004a420c0e9d288859c9ea9259ea0cc Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:00 +0800
Subject: ext4: correct offset of gdb backup in non meta_bg group to update_backups

Commit 0aeaa2559d6d5 ("ext4: fix corruption when online resizing a
1K bigalloc fs") found that the primary superblock's offset in its group
is not equal to the offset of the backup superblock in its group when the
block size is 1K and bigalloc is enabled. As the group descriptor blocks
sit right after the superblock, we can't pass the gdb block number to
update_backups() for the same reason.

The root cause of the issue above is that the leading 1K padding block
is counted as a data block offset for the primary superblock, while the
backup block has no padding block offset in its group. Remove the padding
data block count to fix the issue for gdb backups.

For the meta_bg case, update_backups() treats blk_off as a block number,
so no conversion is done in that case.

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-2-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
Cc: stable@kernel.org
---
 fs/ext4/resize.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 0361c20910de..87cd5b07a970 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1601,6 +1601,8 @@ exit_journal:
 		int gdb_num_end = ((group + flex_gd->count - 1) /
 				   EXT4_DESC_PER_BLOCK(sb));
 		int meta_bg = ext4_has_feature_meta_bg(sb);
+		sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+					  ext4_group_first_block_no(sb, 0);
 		sector_t old_gdb = 0;
 
 		update_backups(sb, ext4_group_first_block_no(sb, 0),
@@ -1612,8 +1614,8 @@ exit_journal:
 						     gdb_num);
 			if (old_gdb == gdb_bh->b_blocknr)
 				continue;
-			update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
-				       gdb_bh->b_size, meta_bg);
+			update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+				       gdb_bh->b_data, gdb_bh->b_size, meta_bg);
 			old_gdb = gdb_bh->b_blocknr;
 		}
 	}
-- cgit v1.2.3


From 9adac8b01f4be28acd5838aade42b8daa4f0b642 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:01 +0800
Subject: ext4: add missing brelse in update_backups

Add the missing brelse() of bh on the error path in update_backups().

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-3-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
Cc: stable@kernel.org
---
 fs/ext4/resize.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 87cd5b07a970..7cbc695b7005 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1191,8 +1191,10 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
 			   ext4_group_first_block_no(sb, group));
 		BUFFER_TRACE(bh, "get_write_access");
 		if ((err = ext4_journal_get_write_access(handle, sb, bh,
-							 EXT4_JTR_NONE)))
+							 EXT4_JTR_NONE))) {
+			brelse(bh);
 			break;
+		}
 		lock_buffer(bh);
 		memcpy(bh->b_data, data, size);
 		if (rest)
-- cgit v1.2.3


From 48f1551592c54f7d8e2befc72a99ff4e47f7dca0 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:02 +0800
Subject: ext4: correct return value of ext4_convert_meta_bg

Avoid ignoring the error stored in "err".
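The fix follows the common first-error-wins idiom; a minimal sketch (hypothetical helper names) of the behavior difference:

#include <stdio.h>

static int do_convert(void)   { return -5; }	/* say the conversion fails */
static int stop_journal(void) { return 0; }	/* the journal stop succeeds */

int main(void)
{
	int err = do_convert();
	int ret = stop_journal();

	/* old: returning ret unconditionally reported success despite err */
	printf("old: %d\n", ret);
	/* new: the first error wins; the stop result is only a fallback */
	printf("new: %d\n", err ? err : ret);
	return 0;
}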
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230826174712.4059355-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/resize.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7cbc695b7005..9cb9ed968599 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1984,9 +1984,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) errout: ret = ext4_journal_stop(handle); - if (!err) - err = ret; - return ret; + return err ? err : ret; invalid_resize_inode: ext4_error(sb, "corrupted/inconsistent resize inode"); -- cgit v1.2.3 From 40dd7953f4d606c280074f10d23046b6812708ce Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sun, 27 Aug 2023 01:47:03 +0800 Subject: ext4: remove gdb backup copy for meta bg in setup_new_flex_group_blocks Wrong check of gdb backup in meta bg as following: first_group is the first group of meta_bg which contains target group, so target group is always >= first_group. We check if target group has gdb backup by comparing first_group with [group + 1] and [group + EXT4_DESC_PER_BLOCK(sb) - 1]. As group >= first_group, then [group + N] is > first_group. So no copy of gdb backup in meta bg is done in setup_new_flex_group_blocks. No need to do gdb backup copy in meta bg from setup_new_flex_group_blocks as we always copy updated gdb block to backups at end of ext4_flex_group_add as following: ext4_flex_group_add /* no gdb backup copy for meta bg any more */ setup_new_flex_group_blocks /* update current group number */ ext4_update_super sbi->s_groups_count += flex_gd->count; /* * if group in meta bg contains backup is added, the primary gdb block * of the meta bg will be copy to backup in new added group here. */ for (; gdb_num <= gdb_num_end; gdb_num++) update_backups(...) In summary, we can remove wrong gdb backup copy code in setup_new_flex_group_blocks. 
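A small sketch of the arithmetic behind the removed check (DESC_PER_BLOCK is a stand-in for EXT4_DESC_PER_BLOCK(sb)): since first_group is the target group rounded down, it can never equal group + 1 or group + DESC_PER_BLOCK - 1, so the condition guarding the copy was never true.

#include <assert.h>
#include <stdio.h>

#define DESC_PER_BLOCK 32U	/* stand-in for EXT4_DESC_PER_BLOCK(sb) */

static unsigned int meta_bg_first_group(unsigned int group)
{
	/* round down to the first group of the meta_bg */
	return (group / DESC_PER_BLOCK) * DESC_PER_BLOCK;
}

int main(void)
{
	for (unsigned int group = 0; group < 4 * DESC_PER_BLOCK; group++) {
		unsigned int first = meta_bg_first_group(group);

		assert(first <= group);
		/* so the removed comparisons could never match */
		assert(first != group + 1);
		assert(first != group + DESC_PER_BLOCK - 1);
	}
	printf("the removed check can never fire\n");
	return 0;
}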
Signed-off-by: Kemeng Shi Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230826174712.4059355-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/resize.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9cb9ed968599..667381180b26 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -560,13 +560,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) goto handle_itb; - if (meta_bg == 1) { - ext4_group_t first_group; - first_group = ext4_meta_bg_first_group(sb, group); - if (first_group != group + 1 && - first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) - goto handle_itb; - } + if (meta_bg == 1) + goto handle_itb; block = start + ext4_bg_has_super(sb, group); /* Copy all of the GDT blocks into the backup in this group */ -- cgit v1.2.3 From e44fc921b84ff08a9e2fb827a146fa4021d016f3 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sun, 27 Aug 2023 01:47:04 +0800 Subject: ext4: fix typo in setup_new_flex_group_blocks grop -> group Signed-off-by: Kemeng Shi Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230826174712.4059355-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 667381180b26..5e4856d68c4a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -609,7 +609,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } handle_itb: - /* Initialize group tables of the grop @group */ + /* Initialize group tables of the group @group */ if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) goto handle_bb; -- cgit v1.2.3 From 7d4cd3b45af025befe3bca94f87359a6603b6e95 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sun, 27 Aug 2023 01:47:05 +0800 Subject: ext4: remove redundant check of count Remove zero check of count which is always non-zero. 
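The invariant that makes the removed check redundant, in a minimal sketch of the loop shape (chunk sizing simplified): the body only runs while count > 0, so an inner zero check of count is dead code.

#include <stdio.h>

int main(void)
{
	unsigned int count = 100, count2;

	/* mirrors the loop shape in set_flexbg_block_bitmap() */
	for (; count > 0; count -= count2) {
		count2 = count > 32 ? 32 : count;	/* chunk for this pass */
		/* count > 0 here by the loop condition */
		printf("marking %u clusters, %u left before this pass\n",
		       count2, count);
	}
	return 0;
}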
Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-7-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5e4856d68c4a..c51994e22af5 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -699,16 +699,14 @@ handle_ib:
 			block = start;
 		}
 
-		if (count) {
-			err = set_flexbg_block_bitmap(sb, handle,
-						flex_gd,
-						EXT4_B2C(sbi, start),
-						EXT4_B2C(sbi,
-							 start + count
-							 - 1));
-			if (err)
-				goto out;
-		}
+		err = set_flexbg_block_bitmap(sb, handle,
+					flex_gd,
+					EXT4_B2C(sbi, start),
+					EXT4_B2C(sbi,
+						 start + count
+						 - 1));
+		if (err)
+			goto out;
 	}
 
 out:
-- cgit v1.2.3


From 31458077273b5f883d99bee33a7fb295f155712d Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:06 +0800
Subject: ext4: remove commented code in reserve_backup_gdb

Remove commented-out code in reserve_backup_gdb().

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-8-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c51994e22af5..a370ca1a2bd6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1080,9 +1080,6 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
 	for (i = 0; i < reserved_gdb; i++) {
 		int err2;
 		data = (__le32 *)primary[i]->b_data;
-		/* printk("reserving backup %lu[%u] = %lu\n",
-		       primary[i]->b_blocknr, gdbackups,
-		       blk + primary[i]->b_blocknr); */
 		data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr);
 		err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]);
 		if (!err)
-- cgit v1.2.3


From 1fc1bd2d18bbade157f7b14270f509ebbd89881b Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:07 +0800
Subject: ext4: calculate free_clusters_count in cluster unit in verify_group_input

The field free_cluster_count in struct ext4_new_group_data should be in
units of clusters. In verify_group_input() this field is being filled in
units of blocks. Fortunately, we don't support online resizing of
bigalloc file systems, and for non-bigalloc file systems, the cluster
size == block size. But fix this in case we do support online resizing
of bigalloc file systems in the future.

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-9-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a370ca1a2bd6..3ad2b1a900ad 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -154,8 +154,9 @@ static int verify_group_input(struct super_block *sb,
 	overhead = ext4_group_overhead_blocks(sb, group);
 	metaend = start + overhead;
-	input->free_clusters_count = free_blocks_count =
-		input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
+	free_blocks_count = input->blocks_count - 2 - overhead -
+			    sbi->s_itb_per_group;
+	input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count);
 
 	if (test_opt(sb, DEBUG))
 		printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks "
-- cgit v1.2.3


From 95b635689b58e0ebe5197bf99c82c681eabe17ee Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:08 +0800
Subject: ext4: remove EXT4FS_DEBUG definition in resize.c

Remove the EXT4FS_DEBUG definition in resize.c for the following reasons:
1.
EXT4FS_DEBUG enables debug messages; it should only be defined when
debugging.
2. ext4.h, included from ext4_jbd2.h after the EXT4FS_DEBUG definition,
will "#undef EXT4FS_DEBUG", so the EXT4FS_DEBUG definition in resize.c
can't actually turn on ext4_debug messages.

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-10-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3ad2b1a900ad..d4ab2cde0a03 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -10,8 +10,6 @@
  */
 
-#define EXT4FS_DEBUG
-
 #include
 #include
 #include
-- cgit v1.2.3


From 70cbfd257995b3f23c2408fd893cc18b61e58b4a Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:09 +0800
Subject: ext4: use saved local variable sbi instead of EXT4_SB(sb)

We save EXT4_SB(sb) to the local variable sbi at the beginning of
ext4_resize_begin(). Use sbi directly instead of EXT4_SB(sb) to remove
unnecessary pointer dereferences.

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-11-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index d4ab2cde0a03..e2c81b589de8 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -55,7 +55,7 @@ int ext4_resize_begin(struct super_block *sb)
 	 * If the reserved GDT blocks is non-zero, the resize_inode feature
 	 * should always be set.
 	 */
-	if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+	if (sbi->s_es->s_reserved_gdt_blocks &&
	    !ext4_has_feature_resize_inode(sb)) {
 		ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
 		return -EFSCORRUPTED;
@@ -67,9 +67,9 @@ int ext4_resize_begin(struct super_block *sb)
 	 * bad time to do it anyways.
 	 */
 	if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
-	    le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+	    le32_to_cpu(sbi->s_es->s_first_data_block)) {
 		ext4_warning(sb, "won't resize using backup superblock at %llu",
-			(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+			(unsigned long long)sbi->s_sbh->b_blocknr);
 		return -EPERM;
 	}
 
@@ -77,7 +77,7 @@ int ext4_resize_begin(struct super_block *sb)
 	 * We are not allowed to do online-resizing on a filesystem mounted
 	 * with error, because it can destroy the filesystem easily.
 	 */
-	if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) {
+	if (sbi->s_mount_state & EXT4_ERROR_FS) {
 		ext4_warning(sb, "There are errors in the filesystem, "
 			     "so online resizing is not allowed");
 		return -EPERM;
@@ -89,7 +89,7 @@ int ext4_resize_begin(struct super_block *sb)
 	}
 
 	if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING,
-				  &EXT4_SB(sb)->s_ext4_flags))
+				  &sbi->s_ext4_flags))
 		ret = -EBUSY;
 
 	return ret;
-- cgit v1.2.3


From 9dca529bdaad7a7242a36d04f73cb998b817ab48 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:10 +0800
Subject: ext4: simplify the gdbblock calculation in add_new_gdb_meta_bg

We always call add_new_gdb_meta_bg() with the first group in the
meta_bg. Remove the unnecessary ext4_meta_bg_first_group() conversion
to simplify the gdbblock calculation.
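A minimal sketch of why the conversion is removable (DESC_PER_BLOCK stands in for EXT4_DESC_PER_BLOCK(sb)): callers pass a group number that is already the first group of its meta_bg, so rounding it down is an identity.

#include <assert.h>
#include <stdio.h>

#define DESC_PER_BLOCK 32U	/* stand-in for EXT4_DESC_PER_BLOCK(sb) */

int main(void)
{
	/* callers pass the first group of a meta_bg: 0, 32, 64, ... */
	for (unsigned int group = 0; group < 256; group += DESC_PER_BLOCK) {
		unsigned int rounded = (group / DESC_PER_BLOCK) * DESC_PER_BLOCK;

		/* rounding an already-aligned group is a no-op ... */
		assert(rounded == group);
	}
	/* ... so ext4_group_first_block_no(sb, group) yields the same gdblock */
	printf("conversion is an identity for aligned callers\n");
	return 0;
}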
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230826174712.4059355-12-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/resize.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e2c81b589de8..5e9084ff131c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -104,18 +104,6 @@ int ext4_resize_end(struct super_block *sb, bool update_backups) return 0; } -static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, - ext4_group_t group) { - return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << - EXT4_DESC_PER_BLOCK_BITS(sb); -} - -static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, - ext4_group_t group) { - group = ext4_meta_bg_first_group(sb, group); - return ext4_group_first_block_no(sb, group); -} - static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ext4_group_t group) { ext4_grpblk_t overhead; @@ -944,7 +932,13 @@ errout: } /* - * add_new_gdb_meta_bg is the sister of add_new_gdb. + * If there is no available space in the existing block group descriptors for + * the new block group and there are no reserved block group descriptors, then + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set + * to the first block group that is managed using meta_bg and s_first_meta_bg + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). + * This function will be called when first group of meta_bg is added to bring + * new group descriptors block of new added meta_bg. */ static int add_new_gdb_meta_bg(struct super_block *sb, handle_t *handle, ext4_group_t group) { @@ -954,8 +948,8 @@ static int add_new_gdb_meta_bg(struct super_block *sb, unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); int err; - gdblock = ext4_meta_bg_first_block_no(sb, group) + - ext4_bg_has_super(sb, group); + gdblock = ext4_group_first_block_no(sb, group) + + ext4_bg_has_super(sb, group); gdb_bh = ext4_sb_bread(sb, gdblock, 0); if (IS_ERR(gdb_bh)) return PTR_ERR(gdb_bh); -- cgit v1.2.3 From 350bb48b84b8f4ad4ea179dbb97f568d12626188 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Sun, 27 Aug 2023 01:47:11 +0800 Subject: ext4: remove unnecessary check to avoid repeat update_backups for the same gdb The sbi->s_group_desc contains array of bh's for block group descriptors and continuous EXT4_DESC_PER_BLOCK(sb) bg descriptors in single block share the same bh. Simply call update_backups for each gdb_bh in sbi->s_group_desc will not update same group descriptors block for multiple times. Commit 0acdb8876fead ("ext4: don't call update_backups() multiple times for the same bg") wrongly assumed each block group descriptor in the same block has a individual bh and unnecessary check was added. 
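A small sketch of the bh sharing described above (DESC_PER_BLOCK stands in for EXT4_DESC_PER_BLOCK(sb)): because the backup-update loop iterates over descriptor-block indexes rather than over groups, each shared bh is visited exactly once, so the old_gdb de-duplication could never trigger.

#include <stdio.h>

#define DESC_PER_BLOCK 32U	/* stand-in for EXT4_DESC_PER_BLOCK(sb) */

int main(void)
{
	/* groups 0..31 share s_group_desc[0], 32..63 share s_group_desc[1], ... */
	unsigned int group_first = 40, group_last = 100;

	unsigned int gdb_first = group_first / DESC_PER_BLOCK;
	unsigned int gdb_last = group_last / DESC_PER_BLOCK;

	/* walking bh indexes, not groups, visits each descriptor block once */
	for (unsigned int gdb_num = gdb_first; gdb_num <= gdb_last; gdb_num++)
		printf("update_backups once for s_group_desc[%u]\n", gdb_num);
	return 0;
}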
Signed-off-by: Kemeng Shi
Link: https://lore.kernel.org/r/20230826174712.4059355-13-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/ext4.h   | 1 +
 fs/ext4/resize.c | 4 ----
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 3b8bc44be528..5ab49226cd9f 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1494,6 +1494,7 @@ struct ext4_sb_info {
 	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
 	struct buffer_head * s_sbh;	/* Buffer containing the super block */
 	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	/* Array of bh's for the block group descriptors */
 	struct buffer_head * __rcu *s_group_desc;
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5e9084ff131c..718b47e131e4 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1588,7 +1588,6 @@ exit_journal:
 		int meta_bg = ext4_has_feature_meta_bg(sb);
 		sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
 					  ext4_group_first_block_no(sb, 0);
-		sector_t old_gdb = 0;
 
 		update_backups(sb, ext4_group_first_block_no(sb, 0),
 			       (char *)es, sizeof(struct ext4_super_block), 0);
@@ -1597,11 +1596,8 @@ exit_journal:
 			gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
 						     gdb_num);
-			if (old_gdb == gdb_bh->b_blocknr)
-				continue;
 			update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
 				       gdb_bh->b_data, gdb_bh->b_size, meta_bg);
-			old_gdb = gdb_bh->b_blocknr;
 		}
 	}
 exit:
-- cgit v1.2.3


From 248b45b621a77155f81129e6b572ec833edb4cf4 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 27 Aug 2023 01:47:12 +0800
Subject: ext4: remove unnecessary initialization of count2 in set_flexbg_block_bitmap

We always overwrite count2 with "EXT4_CLUSTERS_PER_GROUP(sb) -
(first_cluster - start)" after its initialization in the for-loop
initialization statement. Just remove the unnecessary initialization of
count2.

Signed-off-by: Kemeng Shi
Reviewed-by: Theodore Ts'o
Link: https://lore.kernel.org/r/20230826174712.4059355-14-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/resize.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 718b47e131e4..4fe061edefdd 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -447,8 +447,7 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle,
 	ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster,
 		   last_cluster);
-	for (count2 = count; count > 0;
-	     count -= count2, first_cluster += count2) {
+	for (; count > 0; count -= count2, first_cluster += count2) {
 		ext4_fsblk_t start;
 		struct buffer_head *bh;
 		ext4_group_t group;
-- cgit v1.2.3


From 71cd5a5aa0607073adba3852739b7f8c22bc7b50 Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 4 Sep 2023 18:58:16 +0800
Subject: jbd2: print io_block if the data block checksum check fails during recovery

Currently, if the data block checksum check fails, we only print the
data's block number and then skip writing the data. However, one data
block may appear in more than one transaction. This makes offline
analysis inconvenient in some scenarios, as it is difficult to locate
which copy of the data is faulty. So also print 'io_block' when the
data block checksum check fails.
Signed-off-by: Ye Bin
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230904105817.1728356-2-yebin10@huawei.com
Signed-off-by: Theodore Ts'o
---
 fs/jbd2/recovery.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c269a7d29a46..11380ff1fe51 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -661,7 +661,8 @@ static int do_one_pass(journal_t *journal,
 					printk(KERN_ERR "JBD2: Invalid "
 					       "checksum recovering "
 					       "data block %llu in "
-					       "log\n", blocknr);
+					       "journal block %lu\n",
+					       blocknr, io_block);
 					block_error = 1;
 					goto skip_write;
 				}
-- cgit v1.2.3


From 8b6b562121f1981315e76b6ae1f8a5cbbcf14bd7 Mon Sep 17 00:00:00 2001
From: Ye Bin
Date: Mon, 4 Sep 2023 18:58:17 +0800
Subject: jbd2: fix printk format type for 'io_block' in do_one_pass()

'io_block' is unsigned long, but it is printed with '%ld'.

Signed-off-by: Ye Bin
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230904105817.1728356-3-yebin10@huawei.com
Signed-off-by: Theodore Ts'o
---
 fs/jbd2/recovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 11380ff1fe51..0567f1aa189b 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -632,7 +632,7 @@ static int do_one_pass(journal_t *journal,
 					success = err;
 				printk(KERN_ERR
 					"JBD2: IO error %d recovering "
-					"block %ld in log\n",
+					"block %lu in log\n",
 					err, io_block);
 			} else {
 				unsigned long long blocknr;
-- cgit v1.2.3


From af90a8f4a09ec4a3de20142e37f37205d4687f28 Mon Sep 17 00:00:00 2001
From: Gou Hao
Date: Wed, 6 Sep 2023 09:33:41 +0800
Subject: ext4: move 'ix' sanity check to correct position

Check 'ix' before it is used.

Fixes: 80e675f906db ("ext4: optimize memmmove lengths in extent/index insertions")
Signed-off-by: Gou Hao
Link: https://lore.kernel.org/r/20230906013341.7199-1-gouhao@uniontech.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/extents.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 202c76996b62..4d8496d1a8ac 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1010,6 +1010,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 		ix = curp->p_idx;
 	}
 
+	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+		return -EFSCORRUPTED;
+	}
+
 	len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
 	BUG_ON(len < 0);
 	if (len > 0) {
@@ -1019,11 +1024,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
 		memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
 	}
 
-	if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
-		EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
-		return -EFSCORRUPTED;
-	}
-
 	ix->ei_block = cpu_to_le32(logical);
 	ext4_idx_store_pblock(ix, ptr);
 	le16_add_cpu(&curp->p_hdr->eh_entries, 1);
-- cgit v1.2.3


From 2cd8bdb5efc1e0d5b11a4b7ba6b922fd2736a87f Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Mon, 18 Sep 2023 16:15:50 +0530
Subject: ext4: mark buffer new if it is unwritten to avoid stale data exposure

** Short Version **

In ext4 with dioread_nolock, we could have a scenario where the bh
returned by get_blocks (ext4_get_block_unwritten()) in
__block_write_begin_int() has the UNWRITTEN and MAPPED flags set. Since
such a bh does not have the NEW flag set, we never zero out the range of
the bh that is not under write, causing whatever stale data is present
in the folio at that time to be written out to disk.
To fix this mark the buffer as new, in case it is unwritten, in ext4_get_block_unwritten(). ** Long Version ** The issue mentioned above was resulting in two different bugs: 1. On block size < page size case in ext4, generic/269 was reliably failing with dioread_nolock. The state of the write was as follows: * The write was extending i_size. * The last block of the file was fallocated and had an unwritten extent * We were near ENOSPC and hence we were switching to non-delayed alloc allocation. In this case, the back trace that triggers the bug is as follows: ext4_da_write_begin() /* switch to nodelalloc due to low space */ ext4_write_begin() ext4_should_dioread_nolock() // true since mount flags still have delalloc __block_write_begin(..., ext4_get_block_unwritten) __block_write_begin_int() for(each buffer head in page) { /* first iteration, this is bh1 which contains i_size */ if (!buffer_mapped) get_block() /* returns bh with only UNWRITTEN and MAPPED */ /* second iteration, bh2 */ if (!buffer_mapped) get_block() /* we fail here, could be ENOSPC */ } if (err) /* * this would zero out all new buffers and mark them uptodate. * Since bh1 was never marked new, we skip it here which causes * the bug later. */ folio_zero_new_buffers(); /* ext4_wrte_begin() error handling */ ext4_truncate_failed_write() ext4_truncate() ext4_block_truncate_page() __ext4_block_zero_page_range() if(!buffer_uptodate()) ext4_read_bh_lock() ext4_read_bh() -> ... ext4_submit_bh_wbc() BUG_ON(buffer_unwritten(bh)); /* !!! */ 2. The second issue is stale data exposure with page size >= blocksize with dioread_nolock. The conditions needed for it to happen are same as the previous issue ie dioread_nolock around ENOSPC condition. The issue is also similar where in __block_write_begin_int() when we call ext4_get_block_unwritten() on the buffer_head and the underlying extent is unwritten, we get an unwritten and mapped buffer head. Since it is not new, we never zero out the partial range which is not under write, thus writing stale data to disk. This can be easily observed with the following reproducer: fallocate -l 4k testfile xfs_io -c "pwrite 2k 2k" testfile # hexdump output will have stale data in from byte 0 to 2k in testfile hexdump -C testfile NOTE: To trigger this, we need dioread_nolock enabled and write happening via ext4_write_begin(), which is usually used when we have -o nodealloc. Since dioread_nolock is disabled with nodelalloc, the only alternate way to call ext4_write_begin() is to ensure that delayed alloc switches to nodelalloc ie ext4_da_write_begin() calls ext4_write_begin(). This will usually happen when ext4 is almost full like the way generic/269 was triggering it in Issue 1 above. This might make the issue harder to hit. Hence, for reliable replication, I used the below patch to temporarily allow dioread_nolock with nodelalloc and then mount the disk with -o nodealloc,dioread_nolock. With this you can hit the stale data issue 100% of times: @@ -508,8 +508,8 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) if (ext4_should_journal_data(inode)) return 0; /* temporary fix to prevent generic/422 test failures */ - if (!test_opt(inode->i_sb, DELALLOC)) - return 0; + // if (!test_opt(inode->i_sb, DELALLOC)) + // return 0; return 1; } After applying this patch to mark buffer as NEW, both the above issues are fixed. 
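A toy userspace model (hypothetical, not the kernel code paths) of why the NEW flag matters for a partial write into an unwritten-but-mapped block: only buffers flagged new get the not-under-write part of the folio zeroed, so without the flag the stale bytes survive and are written out.

#include <stdio.h>
#include <string.h>

#define BLKSZ 4096

int main(void)
{
	char folio[BLKSZ];
	size_t from = 2048;	/* models "pwrite 2k 2k" into a 4k block */
	int is_new = 1;		/* 0 models the pre-fix behavior */

	memset(folio, 'S', BLKSZ);	/* stale page-cache contents */

	/* write_begin-style logic: only NEW buffers get the part of the
	 * block outside the write range zeroed before the copy-in */
	if (is_new)
		memset(folio, 0, from);
	memcpy(folio + from, "real data", 9);

	/* with is_new == 0, bytes 0..2047 keep 'S' and would hit the disk */
	printf("byte 0: %s\n", folio[0] == 'S' ? "stale" : "zeroed");
	return 0;
}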
Signed-off-by: Ojaswin Mujoo
Cc: stable@kernel.org
Reviewed-by: Jan Kara
Reviewed-by: "Ritesh Harjani (IBM)"
Link: https://lore.kernel.org/r/d0ed09d70a9733fbb5349c5c7b125caac186ecdf.1695033645.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/inode.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4ce35f1c8b0a..d7732320431a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -789,10 +789,22 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
 			     struct buffer_head *bh_result, int create)
 {
+	int ret = 0;
+
 	ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	return _ext4_get_block(inode, iblock, bh_result,
+	ret = _ext4_get_block(inode, iblock, bh_result,
 			       EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
+
+	/*
+	 * If the buffer is marked unwritten, mark it as new to make sure it is
+	 * zeroed out correctly in case of partial writes. Otherwise, there is
+	 * a chance of stale data getting exposed.
+	 */
+	if (ret == 0 && buffer_unwritten(bh_result))
+		set_buffer_new(bh_result);
+
+	return ret;
 }
 
 /* Maximum number of blocks we map for direct IO at once. */
-- cgit v1.2.3


From 484fd6c1de13b336806a967908a927cc0356e312 Mon Sep 17 00:00:00 2001
From: Max Kellermann
Date: Tue, 19 Sep 2023 10:18:23 +0200
Subject: ext4: apply umask if ACL support is disabled

The function ext4_init_acl() calls posix_acl_create() which is
responsible for applying the umask. But without
CONFIG_EXT4_FS_POSIX_ACL, ext4_init_acl() is an empty inline function,
and nobody applies the umask.

This fixes a bug which causes the umask to be ignored with O_TMPFILE
on ext4:

https://github.com/MusicPlayerDaemon/MPD/issues/558
https://bugs.gentoo.org/show_bug.cgi?id=686142#c3
https://bugzilla.kernel.org/show_bug.cgi?id=203625

Reviewed-by: "J. Bruce Fields"
Cc: stable@vger.kernel.org
Signed-off-by: Max Kellermann
Link: https://lore.kernel.org/r/20230919081824.1096619-1-max.kellermann@ionos.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/acl.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 0c5a79c3b5d4..ef4c19e5f570 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -68,6 +68,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
 static inline int ext4_init_acl(handle_t *handle, struct inode *inode,
 				struct inode *dir)
 {
+	/* usually, the umask is applied by posix_acl_create(), but if
+	   ext4 ACL support is disabled at compile time, we need to do
+	   it here, because posix_acl_create() will never be called */
+	inode->i_mode &= ~current_umask();
+
 	return 0;
 }
 #endif  /* CONFIG_EXT4_FS_POSIX_ACL */
-- cgit v1.2.3


From 61187fce8600e8ef90e601be84f9d0f3222c1206 Mon Sep 17 00:00:00 2001
From: Zhihao Cheng
Date: Tue, 19 Sep 2023 09:25:25 +0800
Subject: jbd2: fix potential data loss in recovering journal raced with synchronizing fs bdev

JBD2 makes sure journal data has reached the fs device via
sync_blockdev(); however, another process could consume the EIO
information from the bdev's mapping, which lets journal recovery
succeed even though an EIO occurred while the data was being written
back to the fs device.

We found this problem in our product, where iscsi + multipath is used
as ext4's block device. An unstable network may trigger kpartx to
rescan partitions in the device mapper layer.
The detailed process is shown as follows:

mount                                      kpartx                          irq
jbd2_journal_recover
 do_one_pass
  memcpy(nbh->b_data, obh->b_data) // copy data to fs dev from journal
  mark_buffer_dirty // mark bh dirty
                                           vfs_read
                                            generic_file_read_iter // dio
                                             filemap_write_and_wait_range
                                              __filemap_fdatawrite_range
                                               do_writepages
                                                block_write_full_folio
                                                 submit_bh_wbc
                                                             >> EIO occurs in disk <<
                                                                end_buffer_async_write
                                                                 mark_buffer_write_io_error
                                                                  mapping_set_error
                                                                   set_bit(AS_EIO, &mapping->flags) // set!
                                             filemap_check_errors
                                              test_and_clear_bit(AS_EIO, &mapping->flags) // clear!
 err2 = sync_blockdev
  filemap_write_and_wait
   filemap_check_errors
    test_and_clear_bit(AS_EIO, &mapping->flags) // false
 err2 = 0

The filesystem is mounted successfully even though journal data failed
to be written to disk, and ext4/ocfs2 could become corrupted. Fix it by
comparing the wb_err state of the fs block device before and after
recovery.

A reproducer can be found in the kernel bugzilla referenced below.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=217888
Cc: stable@vger.kernel.org
Signed-off-by: Zhihao Cheng
Signed-off-by: Zhang Yi
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230919012525.1783108-1-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o
---
 fs/jbd2/recovery.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 0567f1aa189b..01f744cb97a4 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,6 +289,8 @@ int jbd2_journal_recover(journal_t *journal)
 	journal_superblock_t *	sb;
 
 	struct recovery_info	info;
+	errseq_t		wb_err;
+	struct address_space	*mapping;
 
 	memset(&info, 0, sizeof(info));
 	sb = journal->j_superblock;
@@ -306,6 +308,9 @@ int jbd2_journal_recover(journal_t *journal)
 		return 0;
 	}
 
+	wb_err = 0;
+	mapping = journal->j_fs_dev->bd_inode->i_mapping;
+	errseq_check_and_advance(&mapping->wb_err, &wb_err);
 	err = do_one_pass(journal, &info, PASS_SCAN);
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -327,6 +332,9 @@ int jbd2_journal_recover(journal_t *journal)
 
 	jbd2_journal_clear_revoke(journal);
 	err2 = sync_blockdev(journal->j_fs_dev);
+	if (!err)
+		err = err2;
+	err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
 	if (!err)
 		err = err2;
 	/* Make sure all replayed data is on permanent storage */
-- cgit v1.2.3


From d2f7cf40ea89b486fb7a25f5ab9a5d3ce26fb4bb Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Fri, 29 Sep 2023 00:03:56 +0800
Subject: ext4: make state in ext4_mb_mark_bb to be bool

As state can only be either 0 or 1, just make it bool.
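A hypothetical userspace stand-in showing the call-site readability the bool buys:

#include <stdbool.h>
#include <stdio.h>

/* toy stand-in for the narrowed ext4_mb_mark_bb() parameter */
static void mark_bb(unsigned long block, int len, bool state)
{
	printf("%s %d cluster(s) at %lu\n",
	       state ? "setting" : "clearing", len, block);
}

int main(void)
{
	/* the bool makes the intent visible at the call site */
	mark_bb(1024, 1, true);		/* mark in use, as in replay paths */
	mark_bb(1024, 1, false);	/* mark free, as in ext4_ext_clear_bb() */
	return 0;
}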
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/extents.c | 4 ++-- fs/ext4/fast_commit.c | 8 ++++---- fs/ext4/mballoc.c | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5ab49226cd9f..8ccebe0d4fbc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2926,7 +2926,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state); + int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4d8496d1a8ac..880f383df684 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -6080,13 +6080,13 @@ int ext4_ext_clear_bb(struct inode *inode) for (j = 0; j < path->p_depth; j++) { ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 0); + path[j].p_block, 1, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, 0, path[j].p_block, 1, 1); } ext4_free_ext_path(path); } - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); ext4_fc_record_regions(inode->i_sb, inode->i_ino, map.m_lblk, map.m_pblk, map.m_len, 1); } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index b06de728b3b6..87c009e0c59a 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1806,7 +1806,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, * at the end of the FC replay using our array of * modified inodes. */ - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } @@ -1875,7 +1875,7 @@ ext4_fc_replay_del_range(struct super_block *sb, if (ret > 0) { remaining -= ret; cur += ret; - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; @@ -1934,12 +1934,12 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, - path[j].p_block, 1, 1); + path[j].p_block, 1, true); ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, - map.m_len, 1); + map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 6e304c18d390..dc2024dd3c27 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4067,7 +4067,7 @@ out_err: * blocks in bitmaps and update counters. 
*/ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, - int len, int state) + int len, bool state) { struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; @@ -6095,7 +6095,7 @@ ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); - ext4_mb_mark_bb(sb, block, 1, 1); + ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; return block; -- cgit v1.2.3 From f9e2d95a45321857bd0397aa07ae30cc20f0988f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:03:57 +0800 Subject: ext4: factor out codes to update block bitmap and group descriptor on disk from ext4_mb_mark_bb There are several reasons to add a general function ext4_mb_mark_context to update block bitmap and group descriptor on disk: 1. pair behavior of alloc/free bits. For example, ext4_mb_new_blocks_simple will update free_clusters in struct flex_groups in ext4_mb_mark_bb while ext4_free_blocks_simple forgets this. 2. remove repeat code to read from disk, update and write back to disk. 3. reduce future unit test mocks to catch real IO to update structure on disk. Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 147 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 70 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dc2024dd3c27..1629bd494167 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3943,6 +3943,80 @@ void ext4_exit_mballoc(void) ext4_groupinfo_destroy_slabs(); } +static int +ext4_mb_mark_context(struct super_block *sb, bool state, ext4_group_t group, + ext4_grpblk_t blkoff, ext4_grpblk_t len) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct buffer_head *bitmap_bh = NULL; + struct ext4_group_desc *gdp; + struct buffer_head *gdp_bh; + int err; + unsigned int i, already, changed; + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) + return PTR_ERR(bitmap_bh); + + err = -EIO; + gdp = ext4_get_group_desc(sb, group, &gdp_bh); + if (!gdp) + goto out_err; + + ext4_lock_group(sb, group); + if (ext4_has_group_desc_csum(sb) && + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_clusters_after_init(sb, group, gdp)); + } + + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + + if (state) { + mb_set_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) - changed); + } else { + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + ext4_free_group_clusters_set(sb, gdp, + ext4_free_group_clusters(sb, gdp) + changed); + } + + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, group, gdp); + ext4_unlock_group(sb, group); + + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, group); + struct flex_groups *fg = sbi_array_rcu_deref(sbi, + s_flex_groups, flex_group); + + if (state) + atomic64_sub(changed, &fg->free_clusters); + else + atomic64_add(changed, &fg->free_clusters); + } + + err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); + if (err) + goto out_err; + err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); + if (err) + goto out_err; + + 
sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + +out_err: + brelse(bitmap_bh); + return err; +} /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps @@ -4069,15 +4143,11 @@ out_err: void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { - struct buffer_head *bitmap_bh = NULL; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; - int i, err = 0; - int already; - unsigned int clen, clen_changed, thisgrp_len; + int err = 0; + unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); @@ -4098,80 +4168,17 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); - bitmap_bh = NULL; break; } - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - break; - } - - err = -EIO; - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - break; - - ext4_lock_group(sb, group); - already = 0; - for (i = 0; i < clen; i++) - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == - !state) - already++; - - clen_changed = clen - already; - if (state) - mb_set_bits(bitmap_bh->b_data, blkoff, clen); - else - mb_clear_bits(bitmap_bh->b_data, blkoff, clen); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, group, gdp)); - } - if (state) - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; - else - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; - - ext4_free_group_clusters_set(sb, gdp, clen); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - - ext4_unlock_group(sb, group); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group); - struct flex_groups *fg = sbi_array_rcu_deref(sbi, - s_flex_groups, flex_group); - - if (state) - atomic64_sub(clen_changed, &fg->free_clusters); - else - atomic64_add(clen_changed, &fg->free_clusters); - - } - - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - break; - sync_dirty_buffer(bitmap_bh); - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(gdp_bh); + err = ext4_mb_mark_context(sb, state, group, blkoff, clen); if (err) break; block += thisgrp_len; len -= thisgrp_len; - brelse(bitmap_bh); BUG_ON(len < 0); } - - if (err) - brelse(bitmap_bh); } /* -- cgit v1.2.3 From 26d0f87b9fff37f9d4a04aa49f40b0d24502d2b1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:03:58 +0800 Subject: ext4: call ext4_mb_mark_context in ext4_free_blocks_simple call ext4_mb_mark_context in ext4_free_blocks_simple to: 1. remove repeat code 2. pair update of free_clusters in ext4_mb_new_blocks_simple. 3. add missing ext4_lock_group/ext4_unlock_group protection. 
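In miniature, the shared-helper idea looks like this. The sketch below is illustrative userspace C with invented names, not the kernel code: one mark routine serves both the allocation and free paths, takes the group lock, and returns how many bits actually changed, so the free-cluster accounting stays paired with the bitmap update on both paths.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static unsigned char bitmap[8];            /* 64 toy clusters, 1 = in use */
static long free_clusters = 64;            /* the paired counter */
static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

/* Shared mark helper: allocation and free both go through here, so the
 * locking and the free_clusters adjustment can never get out of step. */
static int mark_range(unsigned off, unsigned len, bool in_use)
{
        int changed = 0;

        pthread_mutex_lock(&group_lock);
        for (unsigned i = off; i < off + len; i++) {
                bool old = (bitmap[i / 8] >> (i % 8)) & 1;
                if (old != in_use) {
                        bitmap[i / 8] ^= 1u << (i % 8);
                        changed++;
                }
        }
        free_clusters += in_use ? -changed : changed;
        pthread_mutex_unlock(&group_lock);
        return changed;
}

int main(void)
{
        mark_range(3, 4, true);            /* allocate */
        mark_range(3, 4, false);           /* free: counter restored */
        printf("free clusters: %ld\n", free_clusters);
        return 0;
}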
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 33 +-------------------------------- 1 file changed, 1 insertion(+), 32 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 1629bd494167..3761934c9398 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6358,43 +6358,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; ext4_group_t group; ext4_grpblk_t blkoff; - int already_freed = 0, err, i; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - bitmap_bh = ext4_read_block_bitmap(sb, group); - if (IS_ERR(bitmap_bh)) { - pr_warn("Failed to read block bitmap\n"); - return; - } - gdp = ext4_get_group_desc(sb, group, &gdp_bh); - if (!gdp) - goto err_out; - - for (i = 0; i < count; i++) { - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) - already_freed++; - } - mb_clear_bits(bitmap_bh->b_data, blkoff, count); - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); - if (err) - goto err_out; - ext4_free_group_clusters_set( - sb, gdp, ext4_free_group_clusters(sb, gdp) + - count - already_freed); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, group, gdp); - ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); - sync_dirty_buffer(bitmap_bh); - sync_dirty_buffer(gdp_bh); - -err_out: - brelse(bitmap_bh); + ext4_mb_mark_context(sb, false, group, blkoff, count); } /** -- cgit v1.2.3 From c431d3867e0a8258d1948408eb8297cdd2400e67 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:03:59 +0800 Subject: ext4: extend ext4_mb_mark_context to support allocation under journal Previously, ext4_mb_mark_context was only called under the fast commit replay path, so there was no valid handle when we updated the block bitmap and group descriptor. This patch extends ext4_mb_mark_context so that it can also be used by code running under the journal. There are several improvements: 1. Add a "handle_t *handle" argument to journal the block bitmap and group descriptor updates inside ext4_mb_mark_context (the added journal code is based on ext4_mb_mark_diskspace_used, where ext4_mb_mark_context is going to be used). 2. Add a flags argument to ext4_mb_mark_context() which controls: a. EXT4_MB_BITMAP_MARKED_CHECK - whether block bitmap checking is needed. b. EXT4_MB_SYNC_UPDATE - whether the dirty buffers (bitmap and group descriptor) need to be synced.
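The flags keep the optional work out of the common path. A simplified stand-in (invented MARK_* names, plain userspace C, not the kernel function) shows the calling convention; note how changed defaults to the full length unless the caller asks for exact accounting, mirroring the diff that follows.

#include <stdbool.h>
#include <stdio.h>

#define MARK_CHECK 0x0001   /* count only bits that actually flip */
#define MARK_SYNC  0x0002   /* write buffers out synchronously */

static unsigned char bits[8];

/* Simplified analogue of ext4_mb_mark_context(): reports via
 * *ret_changed how many bits changed. */
static int mark_context(bool state, unsigned off, unsigned len,
                        int flags, unsigned *ret_changed)
{
        unsigned changed = len;         /* default: assume all flipped */

        if (flags & MARK_CHECK) {
                changed = 0;
                for (unsigned i = off; i < off + len; i++)
                        if ((bool)((bits[i / 8] >> (i % 8)) & 1) != state)
                                changed++;
        }
        for (unsigned i = off; i < off + len; i++) {
                if (state)
                        bits[i / 8] |= 1u << (i % 8);
                else
                        bits[i / 8] &= ~(1u << (i % 8));
        }
        if (ret_changed)
                *ret_changed = changed;
        if (flags & MARK_SYNC) {
                /* a real filesystem would sync_dirty_buffer() here */
        }
        return 0;
}

int main(void)
{
        unsigned changed;

        mark_context(true, 0, 4, 0, &changed);          /* changed == 4 */
        printf("unchecked: %u\n", changed);
        mark_context(true, 0, 4, MARK_CHECK, &changed); /* changed == 0 */
        printf("checked:   %u\n", changed);
        return 0;
}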
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 64 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3761934c9398..5b3940e3afea 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3943,26 +3943,47 @@ void ext4_exit_mballoc(void) ext4_groupinfo_destroy_slabs(); } +#define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 +#define EXT4_MB_SYNC_UPDATE 0x0002 static int -ext4_mb_mark_context(struct super_block *sb, bool state, ext4_group_t group, - ext4_grpblk_t blkoff, ext4_grpblk_t len) +ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; - unsigned int i, already, changed; + unsigned int i, already, changed = len; + if (ret_changed) + *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); + if (handle) { + BUFFER_TRACE(bitmap_bh, "getting write access"); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; + if (handle) { + BUFFER_TRACE(gdp_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, + EXT4_JTR_NONE); + if (err) + goto out_err; + } + ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { @@ -3971,12 +3992,14 @@ ext4_mb_mark_context(struct super_block *sb, bool state, ext4_group_t group, ext4_free_clusters_after_init(sb, group, gdp)); } - already = 0; - for (i = 0; i < len; i++) - if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == - state) - already++; - changed = len - already; + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { + already = 0; + for (i = 0; i < len; i++) + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == + state) + already++; + changed = len - already; + } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); @@ -3991,6 +4014,8 @@ ext4_mb_mark_context(struct super_block *sb, bool state, ext4_group_t group, ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); + if (ret_changed) + *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); @@ -4003,15 +4028,17 @@ ext4_mb_mark_context(struct super_block *sb, bool state, ext4_group_t group, atomic64_add(changed, &fg->free_clusters); } - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; - sync_dirty_buffer(bitmap_bh); - sync_dirty_buffer(gdp_bh); + if (flags & EXT4_MB_SYNC_UPDATE) { + sync_dirty_buffer(bitmap_bh); + sync_dirty_buffer(gdp_bh); + } out_err: brelse(bitmap_bh); @@ -4171,7 +4198,11 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, break; } - err = ext4_mb_mark_context(sb, state, group, blkoff, clen); + err = ext4_mb_mark_context(NULL, sb, state, 
+ group, blkoff, clen, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); if (err) break; @@ -6363,7 +6394,10 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); - ext4_mb_mark_context(sb, false, group, blkoff, count); + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, + EXT4_MB_BITMAP_MARKED_CHECK | + EXT4_MB_SYNC_UPDATE, + NULL); } /** -- cgit v1.2.3 From 2f94711b098bc55285ac74aba2f2b83fd72b555f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:00 +0800 Subject: ext4: call ext4_mb_mark_context in ext4_mb_mark_diskspace_used Call ext4_mb_mark_context in ext4_mb_mark_diskspace_used to: 1. Remove repeat code to normally update bitmap and group descriptor on disk. 2. Now that we have a common API for marking blocks inuse/free in block bitmap, use that instead of open coding it in function ext4_mb_mark_diskspace_used(). The current code was not updating checksum and other counters. ext4_mb_mark_context() should fix these consistency problems. Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 86 +++++++++++++------------------------------------------ 1 file changed, 20 insertions(+), 66 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 5b3940e3afea..7d1b78fd6c97 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4053,13 +4053,13 @@ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { - struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; - struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; + int flags = 0; + ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); @@ -4067,32 +4067,13 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); - if (IS_ERR(bitmap_bh)) { - return PTR_ERR(bitmap_bh); - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto out_err; - - err = -EIO; - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) - goto out_err; - + return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); - BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); - if (err) - goto out_err; - block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); - len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " @@ -4101,41 +4082,29 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. 
*/ - ext4_lock_group(sb, ac->ac_b_ex.fe_group); - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + err = ext4_mb_mark_context(handle, sb, true, + ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len, + 0, NULL); if (!err) err = -EFSCORRUPTED; - goto out_err; + return err; } - ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, - bitmap_bh->b_data)); - } - } + flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); - if (ext4_has_group_desc_csum(sb) && - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - ext4_free_group_clusters_set(sb, gdp, - ext4_free_clusters_after_init(sb, - ac->ac_b_ex.fe_group, gdp)); - } - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; - ext4_free_group_clusters_set(sb, gdp, len); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, + flags, &changed); + + if (err && changed == 0) + return err; - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != ac->ac_b_ex.fe_len); +#endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ @@ -4145,21 +4114,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, - ac->ac_b_ex.fe_group); - atomic64_sub(ac->ac_b_ex.fe_len, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - if (err) - goto out_err; - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); - -out_err: - brelse(bitmap_bh); return err; } -- cgit v1.2.3 From 33e728c67db6b7e72fc8d09f7a38f72302b8a98d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:01 +0800 Subject: ext4: Separate block bitmap and buddy bitmap freeing in ext4_mb_clear_bb() This patch separates block bitmap and buddy bitmap freeing in order to update the block bitmap with ext4_mb_mark_context in a following patch. Separated freeing is safe with concurrent allocation as long as: 1. The block is first allocated in the buddy bitmap, and then in the block bitmap. 2. The block is first freed in the block bitmap, and then in the buddy bitmap. Then a freed block only becomes available for allocation once both the buddy bitmap and the block bitmap have been updated by the free. Allocation already obeys rule 1, so just do separated freeing following rule 2. Separated freeing has no race with generate_buddy because: once ext4_mb_load_buddy_gfp has executed successfully, the up-to-date buddy page can be found in sbi->s_buddy_cache and no further buddy initialization of that buddy page will run concurrently until the buddy page is unloaded. As we always free in a "load buddy, free, unload buddy" sequence, separated freeing has no race with generate_buddy.
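The two ordering rules can be made concrete with a pair of arrays standing in for the on-disk block bitmap and the in-memory buddy bitmap (an illustrative userspace sketch with invented names, not the ext4 structures):

#include <stdbool.h>
#include <stdio.h>

/* 1 = in use. A block may be handed out only if BOTH maps say free. */
static bool buddy_used[16], ondisk_used[16];

static bool can_allocate(int b) { return !buddy_used[b] && !ondisk_used[b]; }

static void alloc_block(int b)
{
        buddy_used[b] = true;    /* rule 1: buddy bitmap first ... */
        ondisk_used[b] = true;   /* ... then block bitmap */
}

static void free_block(int b)
{
        ondisk_used[b] = false;  /* rule 2: block bitmap first ... */
        /* A concurrent allocator running between these two steps still
         * sees the block busy in the buddy bitmap, so it cannot grab
         * the block before the free has fully completed. */
        buddy_used[b] = false;   /* ... then buddy bitmap */
}

int main(void)
{
        alloc_block(7);
        free_block(7);
        printf("block 7 allocatable: %d\n", can_allocate(7));
        return 0;
}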
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 98 +++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7d1b78fd6c97..124bc5a90e79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6388,7 +6388,7 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; @@ -6412,31 +6412,39 @@ do_more: flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); + trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + + /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) + goto error_out; + + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && + !ext4_inode_block_valid(inode, block, count)) { + ext4_error(sb, "Freeing blocks in system zone - " + "Block = %llu, count = %lu", block, count); + /* err = 0. ext4_std_error should be a no op */ + goto error_clean; + } + bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; - goto error_return; + goto error_clean; } gdp = ext4_get_group_desc(sb, block_group, &gd_bh); if (!gdp) { err = -EIO; - goto error_return; - } - - if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_inode_block_valid(inode, block, count)) { - ext4_error(sb, "Freeing blocks in system zone - " - "Block = %llu, count = %lu", block, count); - /* err = 0. ext4_std_error should be a no op */ - goto error_return; + goto error_clean; } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) - goto error_return; + goto error_clean; /* * We are about to modify some metadata. Call the journal APIs @@ -6446,7 +6454,7 @@ do_more: BUFFER_TRACE(gd_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) - goto error_return; + goto error_clean; #ifdef AGGRESSIVE_CHECK { int i; @@ -6454,13 +6462,30 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); + ret = ext4_free_group_clusters(sb, gdp) + count_clusters; + ext4_free_group_clusters_set(sb, gdp, ret); + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); + ext4_group_desc_csum_set(sb, block_group, gdp); + ext4_unlock_group(sb, block_group); - /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. 
*/ - err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, - GFP_NOFS|__GFP_NOFAIL); - if (err) - goto error_return; + if (sbi->s_log_groups_per_flex) { + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); + atomic64_add(count_clusters, + &sbi_array_rcu_deref(sbi, s_flex_groups, + flex_group)->free_clusters); + } + + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + + /* And the group descriptor block */ + BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; /* * We need to make sure we don't reuse the freed block until after the @@ -6484,13 +6509,8 @@ do_more: new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { - /* need to update group_info->bb_free and bitmap - * with group lock held. generate_buddy look at - * them with group lock_held - */ if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters, NULL); @@ -6503,23 +6523,11 @@ do_more: EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); mb_free_blocks(inode, &e4b, bit, count_clusters); } - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(count_clusters, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they @@ -6532,28 +6540,20 @@ do_more: count_clusters); } - ext4_mb_unload_buddy(&e4b); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; - if (overflow && !err) { block += count; count = overflow; + ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } -error_return: + +error_clean: + ext4_mb_unload_buddy(&e4b); brelse(bitmap_bh); +error_out: ext4_std_error(sb, err); } -- cgit v1.2.3 From 38b8f70cd28c6ad21192d82f271b9e32bdcb8600 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:02 +0800 Subject: ext4: call ext4_mb_mark_context in ext4_mb_clear_bb Call ext4_mb_mark_context in ext4_mb_clear_bb to remove repeat code. 
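The error paths reworked across this series follow the usual kernel staged-cleanup idiom: error_clean releases everything acquired so far, error_out only reports. A generic sketch of that pattern (plain userspace C with invented names, not the ext4 code; the conversion in the diff that follows applies the same shape to ext4_mb_clear_bb):

#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
        int err = -1;
        char *buddy = NULL, *bitmap = NULL;

        buddy = malloc(32);          /* stands in for ext4_mb_load_buddy() */
        if (!buddy)
                goto error_out;      /* nothing acquired yet */

        bitmap = malloc(32);         /* stands in for reading the bitmap */
        if (!bitmap)
                goto error_clean;    /* must still release the buddy */

        err = 0;                     /* ... real work ... */

        free(bitmap);
error_clean:
        free(buddy);                 /* ext4_mb_unload_buddy() equivalent,
                                      * reached on success and failure alike */
error_out:
        return err;
}

int main(void) { return do_work() ? EXIT_FAILURE : EXIT_SUCCESS; }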
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-8-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 69 ++++++++----------------------------------------------- 1 file changed, 10 insertions(+), 59 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 124bc5a90e79..d90fb5660429 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6367,19 +6367,17 @@ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { - struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_group_desc *gdp; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; - struct buffer_head *gd_bh; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; - int ret; + int mark_flags = 0; + ext4_grpblk_t changed; sbi = EXT4_SB(sb); @@ -6428,64 +6426,19 @@ do_more: goto error_clean; } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_clean; - } - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!gdp) { - err = -EIO; - goto error_clean; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_clean; - - /* - * We are about to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) - goto error_clean; #ifdef AGGRESSIVE_CHECK - { - int i; - for (i = 0; i < count_clusters; i++) - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); - } + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; - ext4_free_group_clusters_set(sb, gdp, ret); - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, gdp); - ext4_unlock_group(sb, block_group); + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + count_clusters, mark_flags, &changed); - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(count_clusters, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + if (err && changed == 0) + goto error_clean; - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; +#ifdef AGGRESSIVE_CHECK + BUG_ON(changed != count_clusters); +#endif /* * We need to make sure we don't reuse the freed block until after the @@ -6544,7 +6497,6 @@ do_more: block += count; count = overflow; ext4_mb_unload_buddy(&e4b); - put_bh(bitmap_bh); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; @@ -6552,7 +6504,6 @@ do_more: error_clean: ext4_mb_unload_buddy(&e4b); - brelse(bitmap_bh); error_out: ext4_std_error(sb, err); } -- cgit v1.2.3 From 03c7fc39a677cdeae076d8b9ecc10920dd13f6a8 Mon Sep 17 00:00:00 
2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:03 +0800 Subject: ext4: Separate block bitmap and buddy bitmap freeing in ext4_group_add_blocks() This patch separates block bitmap and buddy bitmap freeing in order to update the block bitmap with ext4_mb_mark_context in a following patch. The reason why this can be separated is explained in the previous patch. The explanation is repeated here to simplify code archaeology for ext4_group_add_blocks(): Separated freeing is safe with concurrent allocation as long as: 1. The block is first allocated in the buddy bitmap, and then in the block bitmap. 2. The block is first freed in the block bitmap, and then in the buddy bitmap. Then a freed block only becomes available for allocation once both the buddy bitmap and the block bitmap have been updated by the free. Allocation already obeys rule 1, so just do separated freeing following rule 2. Separated freeing has no race with generate_buddy because: once ext4_mb_load_buddy_gfp has executed successfully, the up-to-date buddy page can be found in sbi->s_buddy_cache and no further buddy initialization of that buddy page will run concurrently until the buddy page is unloaded. As we always free in a "load buddy, free, unload buddy" sequence, separated freeing has no race with generate_buddy. Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-9-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 54 ++++++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d90fb5660429..e866929a9760 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6650,35 +6650,39 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; - goto error_return; + goto error_out; + } + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_out; + + if (!ext4_sb_block_valid(sb, NULL, block, count)) { + ext4_error(sb, "Adding blocks in system zones - " + "Block = %llu, count = %lu", + block, count); + err = -EINVAL; + goto error_clean; } bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); bitmap_bh = NULL; - goto error_return; + goto error_clean; } desc = ext4_get_group_desc(sb, block_group, &gd_bh); if (!desc) { err = -EIO; - goto error_return; - } - - if (!ext4_sb_block_valid(sb, NULL, block, count)) { - ext4_error(sb, "Adding blocks in system zones - " - "Block = %llu, count = %lu", - block, count); - err = -EINVAL; - goto error_return; + goto error_clean; } BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) - goto error_return; + goto error_clean; /* * We are about to modify some metadata. Call the journal APIs @@ -6688,7 +6692,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, BUFFER_TRACE(gd_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) - goto error_return; + goto error_clean; for (i = 0, clusters_freed = 0; i < cluster_count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); @@ -6701,26 +6705,14 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } } - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - - /* - * need to update group_info->bb_free and bitmap - * with group lock held.
generate_buddy look at - * them with group lock_held - */ ext4_lock_group(sb, block_group); mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); - mb_free_blocks(NULL, &e4b, bit, cluster_count); free_clusters_count = clusters_freed + ext4_free_group_clusters(sb, desc); ext4_free_group_clusters_set(sb, desc, free_clusters_count); ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); ext4_group_desc_csum_set(sb, block_group, desc); ext4_unlock_group(sb, block_group); - percpu_counter_add(&sbi->s_freeclusters_counter, - clusters_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); @@ -6729,8 +6721,6 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, flex_group)->free_clusters); } - ext4_mb_unload_buddy(&e4b); - /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); @@ -6741,8 +6731,16 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, if (!err) err = ret; -error_return: + ext4_lock_group(sb, block_group); + mb_free_blocks(NULL, &e4b, bit, cluster_count); + ext4_unlock_group(sb, block_group); + percpu_counter_add(&sbi->s_freeclusters_counter, + clusters_freed); + +error_clean: brelse(bitmap_bh); + ext4_mb_unload_buddy(&e4b); +error_out: ext4_std_error(sb, err); return err; } -- cgit v1.2.3 From 5c657db46d9ebc63e57be1408f1986c9b23368e1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:04 +0800 Subject: ext4: call ext4_mb_mark_context in ext4_group_add_blocks() Call ext4_mb_mark_context in ext4_group_add_blocks() to remove repeat code. Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-10-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 82 +++++++------------------------------------------------ 1 file changed, 10 insertions(+), 72 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e866929a9760..b26e4d64cd4c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6622,23 +6622,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned int i; - struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; - int err = 0, ret, free_clusters_count; - ext4_grpblk_t clusters_freed; + int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; + ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); - if (count == 0) + if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -6665,80 +6661,22 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, goto error_clean; } - bitmap_bh = ext4_read_block_bitmap(sb, block_group); - if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - bitmap_bh = NULL; - goto error_clean; - } - - desc = ext4_get_group_desc(sb, block_group, &gd_bh); - if (!desc) { - err = -EIO; - goto error_clean; - } - - BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, - EXT4_JTR_NONE); - if (err) - goto error_clean; - - /* - * We are about 
to modify some metadata. Call the journal APIs - * to unshare ->b_data if a currently-committing transaction is - * using it - */ - BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); - if (err) + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, + cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, + &changed); + if (err && changed == 0) goto error_clean; - for (i = 0, clusters_freed = 0; i < cluster_count; i++) { - BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { - ext4_error(sb, "bit already cleared for block %llu", - (ext4_fsblk_t)(block + i)); - BUFFER_TRACE(bitmap_bh, "bit already cleared"); - } else { - clusters_freed++; - } - } - - ext4_lock_group(sb, block_group); - mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); - free_clusters_count = clusters_freed + - ext4_free_group_clusters(sb, desc); - ext4_free_group_clusters_set(sb, desc, free_clusters_count); - ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); - ext4_group_desc_csum_set(sb, block_group, desc); - ext4_unlock_group(sb, block_group); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); - atomic64_add(clusters_freed, - &sbi_array_rcu_deref(sbi, s_flex_groups, - flex_group)->free_clusters); - } - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); - - /* And the group descriptor block */ - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); - if (!err) - err = ret; + if (changed != cluster_count) + ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, - clusters_freed); + changed); error_clean: - brelse(bitmap_bh); ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); -- cgit v1.2.3 From bdefd689b7ff0eadc3b29dc6c66556617bd1ed42 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:05 +0800 Subject: ext4: add some kunit stub for mballoc kunit test Multiblocks allocation will read and write block bitmap and group descriptor which reside on disk. Add kunit stub to function ext4_get_group_desc, ext4_read_block_bitmap_nowait, ext4_wait_block_bitmap and ext4_mb_mark_context to avoid real IO to disk. 
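For readers new to the KUnit static-stub machinery: the function under test opens with a redirect macro, and a running test registers its replacement at runtime. A condensed kernel-style sketch of the shape follows; mbt_read_bitmap and its stub are invented names, while KUNIT_STATIC_STUB_REDIRECT and kunit_activate_static_stub are the real KUnit API, used exactly this way in the diffs below (this only builds in a KUnit-enabled kernel tree):

#include <kunit/static_stub.h>
#include <kunit/test.h>

/* Code under test: the redirect must come before any real work. It is
 * a no-op unless the calling thread is running a KUnit test that has
 * activated a replacement for this function. */
static int mbt_read_bitmap(int group)
{
        KUNIT_STATIC_STUB_REDIRECT(mbt_read_bitmap, group);
        return -EIO;    /* pretend real disk IO happens here */
}

/* Test side: a canned in-memory answer, no IO. */
static int mbt_read_bitmap_stub(int group)
{
        return 0;
}

static void mbt_stub_example(struct kunit *test)
{
        kunit_activate_static_stub(test, mbt_read_bitmap,
                                   mbt_read_bitmap_stub);
        KUNIT_EXPECT_EQ(test, mbt_read_bitmap(0), 0);
        /* the stub is deactivated automatically when the test ends */
}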
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-11-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 10 ++++++++++ fs/ext4/mballoc.c | 5 +++++ 2 files changed, 15 insertions(+) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4d08bb2bd184..591fb3f710be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -22,6 +22,7 @@ #include "mballoc.h" #include +#include static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); @@ -272,6 +273,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, + sb, block_group, bh); + if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); @@ -466,6 +470,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, ext4_fsblk_t bitmap_blk; int err; + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, + sb, block_group, ignore_locked); + desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); @@ -561,6 +568,9 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, { struct ext4_group_desc *desc; + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, + sb, block_group, bh); + if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b26e4d64cd4c..401b8329d1e1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * MUSTDO: @@ -3957,6 +3958,10 @@ ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, int err; unsigned int i, already, changed = len; + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, + handle, sb, state, group, blkoff, len, + flags, ret_changed); + if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); -- cgit v1.2.3 From 7c9fa399a369546c0e0375ea4b354d642a8fe501 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:06 +0800 Subject: ext4: add first unit test for ext4_mb_new_blocks_simple in mballoc Here is the preparation work: 1. Include mballoc-test.c in mballoc.c so that static functions in mballoc.c can be tested. 2. Implement static stubs to avoid real read IO to disk. 3. Construct a fake super_block. Only some members are set; more members will be set as more functions are tested. Then a unit test for ext4_mb_new_blocks_simple is added. Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-12-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc-test.c | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/mballoc.c | 4 + 2 files changed, 329 insertions(+) create mode 100644 fs/ext4/mballoc-test.c diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c new file mode 100644 index 000000000000..120c4944d2e1 --- /dev/null +++ b/fs/ext4/mballoc-test.c @@ -0,0 +1,325 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test of ext4 multiblocks allocation.
+ */ + +#include +#include + +#include "ext4.h" + +struct mbt_grp_ctx { + struct buffer_head bitmap_bh; + /* desc and gd_bh are just the place holders for now */ + struct ext4_group_desc desc; + struct buffer_head gd_bh; +}; + +struct mbt_ctx { + struct mbt_grp_ctx *grp_ctx; +}; + +struct mbt_ext4_super_block { + struct super_block sb; + struct mbt_ctx mbt_ctx; +}; + +#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx)) +#define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) + +static struct super_block *mbt_ext4_alloc_super_block(void) +{ + struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL); + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); + + if (fsb == NULL || sbi == NULL || es == NULL) + goto out; + + sbi->s_es = es; + fsb->sb.s_fs_info = sbi; + return &fsb->sb; + +out: + kfree(fsb); + kfree(sbi); + kfree(es); + return NULL; +} + +static void mbt_ext4_free_super_block(struct super_block *sb) +{ + struct mbt_ext4_super_block *fsb = + container_of(sb, struct mbt_ext4_super_block, sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + + kfree(sbi->s_es); + kfree(sbi); + kfree(fsb); +} + +struct mbt_ext4_block_layout { + unsigned char blocksize_bits; + unsigned int cluster_bits; + uint32_t blocks_per_group; + ext4_group_t group_count; + uint16_t desc_size; +}; + +static void mbt_init_sb_layout(struct super_block *sb, + struct mbt_ext4_block_layout *layout) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + + sb->s_blocksize = 1UL << layout->blocksize_bits; + sb->s_blocksize_bits = layout->blocksize_bits; + + sbi->s_groups_count = layout->group_count; + sbi->s_blocks_per_group = layout->blocks_per_group; + sbi->s_cluster_bits = layout->cluster_bits; + sbi->s_cluster_ratio = 1U << layout->cluster_bits; + sbi->s_clusters_per_group = layout->blocks_per_group >> + layout->cluster_bits; + sbi->s_desc_size = layout->desc_size; + + es->s_first_data_block = cpu_to_le32(0); + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * + layout->group_count); +} + +static int mbt_grp_ctx_init(struct super_block *sb, + struct mbt_grp_ctx *grp_ctx) +{ + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); + if (grp_ctx->bitmap_bh.b_data == NULL) + return -ENOMEM; + + return 0; +} + +static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) +{ + kfree(grp_ctx->bitmap_bh.b_data); + grp_ctx->bitmap_bh.b_data = NULL; +} + +static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, + unsigned int start, unsigned int len) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); +} + +/* called after mbt_init_sb_layout */ +static int mbt_ctx_init(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), + GFP_KERNEL); + if (ctx->grp_ctx == NULL) + return -ENOMEM; + + for (i = 0; i < ngroups; i++) + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) + goto out; + + /* + * first data block(first cluster in first group) is used by + * metadata, mark it used to avoid to alloc data block at first + * block which will fail ext4_sb_block_valid check. 
+ */ + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); + + return 0; +out: + while (i-- > 0) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); + return -ENOMEM; +} + +static void mbt_ctx_release(struct super_block *sb) +{ + struct mbt_ctx *ctx = MBT_CTX(sb); + ext4_group_t i, ngroups = ext4_get_groups_count(sb); + + for (i = 0; i < ngroups; i++) + mbt_grp_ctx_release(&ctx->grp_ctx[i]); + kfree(ctx->grp_ctx); +} + +static struct buffer_head * +ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, + bool ignore_locked) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ + get_bh(&grp_ctx->bitmap_bh); + return &grp_ctx->bitmap_bh; +} + +static int ext4_wait_block_bitmap_stub(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + return 0; +} + +static struct ext4_group_desc * +ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, + struct buffer_head **bh) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); + + if (bh != NULL) + *bh = &grp_ctx->gd_bh; + + return &grp_ctx->desc; +} + +static int +ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, + ext4_group_t group, ext4_grpblk_t blkoff, + ext4_grpblk_t len, int flags, + ext4_grpblk_t *ret_changed) +{ + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; + + if (state) + mb_set_bits(bitmap_bh->b_data, blkoff, len); + else + mb_clear_bits(bitmap_bh->b_data, blkoff, len); + + return 0; +} + +#define TEST_BLOCKSIZE_BITS 10 +#define TEST_CLUSTER_BITS 3 +#define TEST_BLOCKS_PER_GROUP 8192 +#define TEST_GROUP_COUNT 4 +#define TEST_DESC_SIZE 64 +#define TEST_GOAL_GROUP 1 +static int mbt_kunit_init(struct kunit *test) +{ + struct mbt_ext4_block_layout layout = { + .blocksize_bits = TEST_BLOCKSIZE_BITS, + .cluster_bits = TEST_CLUSTER_BITS, + .blocks_per_group = TEST_BLOCKS_PER_GROUP, + .group_count = TEST_GROUP_COUNT, + .desc_size = TEST_DESC_SIZE, + }; + struct super_block *sb; + int ret; + + sb = mbt_ext4_alloc_super_block(); + if (sb == NULL) + return -ENOMEM; + + mbt_init_sb_layout(sb, &layout); + + ret = mbt_ctx_init(sb); + if (ret != 0) { + mbt_ext4_free_super_block(sb); + return ret; + } + + test->priv = sb; + kunit_activate_static_stub(test, + ext4_read_block_bitmap_nowait, + ext4_read_block_bitmap_nowait_stub); + kunit_activate_static_stub(test, + ext4_wait_block_bitmap, + ext4_wait_block_bitmap_stub); + kunit_activate_static_stub(test, + ext4_get_group_desc, + ext4_get_group_desc_stub); + kunit_activate_static_stub(test, + ext4_mb_mark_context, + ext4_mb_mark_context_stub); + return 0; +} + +static void mbt_kunit_exit(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + + mbt_ctx_release(sb); + mbt_ext4_free_super_block(sb); +} + +static void test_new_blocks_simple(struct kunit *test) +{ + struct super_block *sb = (struct super_block *)test->priv; + struct inode inode = { .i_sb = sb, }; + struct ext4_allocation_request ar; + ext4_group_t i, goal_group = TEST_GOAL_GROUP; + int err = 0; + ext4_fsblk_t found; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + ar.inode = &inode; + + /* get block at goal */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, + "failed to alloc block at goal, expected %llu found %llu", + ar.goal, found); + + /* 
get block after goal in goal group */ + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, + "failed to alloc block after goal in goal group, expected %llu found %llu", + ar.goal + EXT4_C2B(sbi, 1), found); + + /* get block after goal group */ + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, goal_group + 1), found, + "failed to alloc block after goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, goal_group + 1), found); + + /* get block before goal group */ + for (i = goal_group; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_EQ_MSG(test, + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, + "failed to alloc block before goal group, expected %llu found %llu", + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found); + + /* no block available, fail to allocate block */ + for (i = 0; i < ext4_get_groups_count(sb); i++) + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); + ar.goal = ext4_group_first_block_no(sb, goal_group); + found = ext4_mb_new_blocks_simple(&ar, &err); + KUNIT_ASSERT_NE_MSG(test, err, 0, + "unexpectedly get block when no block is available"); +} + + +static struct kunit_case mbt_test_cases[] = { + KUNIT_CASE(test_new_blocks_simple), + {} +}; + +static struct kunit_suite mbt_test_suite = { + .name = "ext4_mballoc_test", + .init = mbt_kunit_init, + .exit = mbt_kunit_exit, + .test_cases = mbt_test_cases, +}; + +kunit_test_suites(&mbt_test_suite); + +MODULE_LICENSE("GPL"); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 401b8329d1e1..454d5612641e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6991,3 +6991,7 @@ out_unload: return error; } + +#ifdef CONFIG_EXT4_KUNIT_TESTS +#include "mballoc-test.c" +#endif -- cgit v1.2.3 From 28b95ee868072f1cd029245bf96cf02844322f37 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 29 Sep 2023 00:04:07 +0800 Subject: ext4: run mballoc test with different layout settings Use KUNIT_CASE_PARAM to run the mballoc test with different layout settings.
Signed-off-by: Kemeng Shi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20230928160407.142069-13-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc-test.c | 52 ++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index 120c4944d2e1..f94901fd3835 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -199,21 +199,11 @@ ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, return 0; } -#define TEST_BLOCKSIZE_BITS 10 -#define TEST_CLUSTER_BITS 3 -#define TEST_BLOCKS_PER_GROUP 8192 -#define TEST_GROUP_COUNT 4 -#define TEST_DESC_SIZE 64 #define TEST_GOAL_GROUP 1 static int mbt_kunit_init(struct kunit *test) { - struct mbt_ext4_block_layout layout = { - .blocksize_bits = TEST_BLOCKSIZE_BITS, - .cluster_bits = TEST_CLUSTER_BITS, - .blocks_per_group = TEST_BLOCKS_PER_GROUP, - .group_count = TEST_GROUP_COUNT, - .desc_size = TEST_DESC_SIZE, - }; + struct mbt_ext4_block_layout *layout = + (struct mbt_ext4_block_layout *)(test->param_value); struct super_block *sb; int ret; @@ -221,7 +211,7 @@ static int mbt_kunit_init(struct kunit *test) if (sb == NULL) return -ENOMEM; - mbt_init_sb_layout(sb, &layout); + mbt_init_sb_layout(sb, layout); ret = mbt_ctx_init(sb); if (ret != 0) { @@ -307,9 +297,43 @@ static void test_new_blocks_simple(struct kunit *test) "unexpectedly get block when no block is available"); } +static const struct mbt_ext4_block_layout mbt_test_layouts[] = { + { + .blocksize_bits = 10, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 12, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, + { + .blocksize_bits = 16, + .cluster_bits = 3, + .blocks_per_group = 8192, + .group_count = 4, + .desc_size = 64, + }, +}; + +static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, + char *desc) +{ + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " + "blocks_per_group=%d group_count=%d desc_size=%d\n", + layout->blocksize_bits, layout->cluster_bits, + layout->blocks_per_group, layout->group_count, + layout->desc_size); +} +KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); static struct kunit_case mbt_test_cases[] = { - KUNIT_CASE(test_new_blocks_simple), + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), {} }; -- cgit v1.2.3 From ce56d21355cd6f6937aca32f1f44ca749d1e4808 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 2 Oct 2023 14:50:20 -0400 Subject: ext4: fix racy may inline data check in dio write syzbot reports that the following warning from ext4_iomap_begin() triggers as of the commit referenced below: if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; This occurs during a dio write, which is never expected to encounter an inode with inline data. To enforce this behavior, ext4_dio_write_iter() checks the current inline state of the inode and clears the MAY_INLINE_DATA state flag to either fall back to buffered writes, or enforce that any other writers in progress on the inode are not allowed to create inline data. The problem is that the check for existing inline data and the state flag can span a lock cycle. 
For example, if the ilock is originally locked shared and subsequently upgraded to exclusive, another writer may have reacquired the lock and created inline data before the dio write task acquires the lock and proceeds.

The commit referenced below loosens the lock requirements to allow some forms of unaligned dio writes to occur under shared lock, but AFAICT the inline data check was technically already racy for any dio write that would have involved a lock cycle. Regardless, lift clearing of the state bit to the same lock critical section that checks for preexisting inline data on the inode to close the race.

Cc: stable@kernel.org
Reported-by: syzbot+307da6ca5cb0d01d581a@syzkaller.appspotmail.com
Fixes: 310ee0902b8d ("ext4: allow concurrent unaligned dio overwrites")
Signed-off-by: Brian Foster
Link: https://lore.kernel.org/r/20231002185020.531537-1-bfoster@redhat.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/file.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6830ea3a6c59..747c0378122d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -569,18 +569,20 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 return ext4_buffered_write_iter(iocb, from);
 }
+ /*
+ * Prevent inline data from being created since we are going to allocate
+ * blocks for DIO. We know the inode does not currently have inline data
+ * because ext4_should_use_dio() checked for it, but we have to clear
+ * the state flag before the write checks because a lock cycle could
+ * introduce races with other writers.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+
 ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags);
 if (ret <= 0)
 return ret;
- /*
- * Make sure inline data cannot be created anymore since we are going
- * to allocate blocks for DIO. We know the inode does not have any
- * inline data now because ext4_dio_supported() checked for that.
- */
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-
 offset = iocb->ki_pos;
 count = ret;
-- cgit v1.2.3

From 91562895f8030cb9a0470b1db49de79346a69f91 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Fri, 13 Oct 2023 14:13:50 +0200
Subject: ext4: properly sync file size update after O_SYNC direct IO

Gao Xiang has reported that on ext4 O_SYNC direct IO does not properly sync the file size update and thus if we crash at an unfortunate moment, the file can have a smaller size although the O_SYNC IO has reported successful completion. The problem happens because the update of the on-disk inode size is handled in ext4_dio_write_iter() *after* iomap_dio_rw() (and thus iomap_dio_complete() in particular) has returned, and generic_write_sync() gets called from iomap_dio_complete(). Fix the problem by handling the on-disk inode size update directly in our ->end_io completion handler.
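[Editor's note: to make the ordering concrete, here is a rough sketch of where the ->end_io hook sits relative to the O_SYNC flush in the iomap DIO path. The names example_end_io and example_dio_ops are illustrative assumptions, not code from the patch.]

#include <linux/iomap.h>

/*
 * Approximate call chain in the iomap DIO path:
 *
 *   iomap_dio_rw()
 *     iomap_dio_complete()
 *       dops->end_io()        <- filesystem completion hook
 *       generic_write_sync()  <- O_SYNC/IS_SYNC flush happens here
 *   <control returns to the filesystem's ->write_iter>
 *
 * Any on-disk size update done after iomap_dio_rw() returns therefore
 * misses the O_SYNC flush; doing it inside ->end_io places it before
 * the flush, which is what this patch implements for ext4.
 */
static int example_end_io(struct kiocb *iocb, ssize_t size, int error,
			  unsigned int flags)
{
	/* Persist the new on-disk inode size here, before the sync. */
	return error;
}

static const struct iomap_dio_ops example_dio_ops = {
	.end_io = example_end_io,
};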
References: https://lore.kernel.org/all/02d18236-26ef-09b0-90ad-030c4fe3ee20@linux.alibaba.com Reported-by: Gao Xiang CC: stable@vger.kernel.org Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure") Signed-off-by: Jan Kara Tested-by: Joseph Qi Reviewed-by: "Ritesh Harjani (IBM)" Link: https://lore.kernel.org/r/20231013121350.26872-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 153 ++++++++++++++++++++++++--------------------------------- 1 file changed, 65 insertions(+), 88 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 747c0378122d..0166bb9ca160 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -306,80 +306,38 @@ out: } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, - ssize_t written, size_t count) + ssize_t count) { handle_t *handle; - bool truncate = false; - u8 blkbits = inode->i_blkbits; - ext4_lblk_t written_blk, end_blk; - int ret; - - /* - * Note that EXT4_I(inode)->i_disksize can get extended up to - * inode->i_size while the I/O was running due to writeback of delalloc - * blocks. But, the code in ext4_iomap_alloc() is careful to use - * zeroed/unwritten extents if this is possible; thus we won't leave - * uninitialized blocks in a file even if we didn't succeed in writing - * as much as we intended. - */ - WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); - if (offset + count <= EXT4_I(inode)->i_disksize) { - /* - * We need to ensure that the inode is removed from the orphan - * list if it has been added prematurely, due to writeback of - * delalloc blocks. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); - } - - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - } - - return written; - } - - if (written < 0) - goto truncate; + lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - written = PTR_ERR(handle); - goto truncate; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); - if (ext4_update_inode_size(inode, offset + written)) { - ret = ext4_mark_inode_dirty(handle, inode); + if (ext4_update_inode_size(inode, offset + count)) { + int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { - written = ret; ext4_journal_stop(handle); - goto truncate; + return ret; } } - /* - * We may need to truncate allocated but not written blocks beyond EOF. - */ - written_blk = ALIGN(offset + written, 1 << blkbits); - end_blk = ALIGN(offset + count, 1 << blkbits); - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - - /* - * Remove the inode from the orphan list if it has been extended and - * everything went OK. - */ - if (!truncate && inode->i_nlink) + if (inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - if (truncate) { -truncate: + return count; +} + +/* + * Clean up the inode after DIO or DAX extending write has completed and the + * inode size has been updated using ext4_handle_inode_extension(). 
+ */ +static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) +{ + lockdep_assert_held_write(&inode->i_rwsem); + if (count < 0) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may @@ -388,9 +346,28 @@ truncate: */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); + return; } + /* + * If i_disksize got extended due to writeback of delalloc blocks while + * the DIO was running we could fail to cleanup the orphan list in + * ext4_handle_inode_extension(). Do it now. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - return written; + if (IS_ERR(handle)) { + /* + * The write has successfully completed. Not much to + * do with the error here so just cleanup the orphan + * list and hope for the best. + */ + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, @@ -399,31 +376,22 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); + if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; - - if (size && flags & IOMAP_DIO_UNWRITTEN) { - error = ext4_convert_unwritten_extents(NULL, inode, pos, size); - if (error < 0) - return error; - } /* - * If we are extending the file, we have to update i_size here before - * page cache gets invalidated in iomap_dio_rw(). Otherwise racing - * buffered reads could zero out too much from page cache pages. Update - * of on-disk size will happen later in ext4_dio_write_iter() where - * we have enough information to also perform orphan list handling etc. - * Note that we perform all extending writes synchronously under - * i_rwsem held exclusively so i_size update is safe here in that case. - * If the write was not extending, we cannot see pos > i_size here - * because operations reducing i_size like truncate wait for all - * outstanding DIO before updating i_size. + * Note that EXT4_I(inode)->i_disksize can get extended up to + * inode->i_size while the I/O was running due to writeback of delalloc + * blocks. But the code in ext4_iomap_alloc() is careful to use + * zeroed/unwritten extents if this is possible; thus we won't leave + * uninitialized blocks in a file even if we didn't succeed in writing + * as much as we intended. */ - pos += size; - if (pos > i_size_read(inode)) - i_size_write(inode, pos); - - return 0; + WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize)); + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize)) + return size; + return ext4_handle_inode_extension(inode, pos, size); } static const struct iomap_dio_ops ext4_dio_write_ops = { @@ -608,9 +576,16 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; - - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + /* + * We always perform extending DIO write synchronously so by + * now the IO is completed and ext4_handle_inode_extension() + * was called. Cleanup the inode in case of error or race with + * writeback of delalloc blocks. 
+ */ + WARN_ON_ONCE(ret == -EIOCBQUEUED); + ext4_inode_extension_cleanup(inode, ret); + } out: if (ilock_shared) @@ -691,8 +666,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + ret = ext4_handle_inode_extension(inode, offset, ret); + ext4_inode_extension_cleanup(inode, ret); + } out: inode_unlock(inode); if (ret > 0) -- cgit v1.2.3
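[Editor's note: as a closing illustration of the guarantee the last patch restores, here is a minimal userspace sketch. The mount-point path, file name, and 4096-byte alignment are assumptions; adjust for the filesystem under test. Once an extending O_DIRECT|O_SYNC write returns success, both the data and the new file size must survive a crash; before the fix, the on-disk size update could still be unsynced at that point.]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	ssize_t ret;
	/* Assumed path: any new file on an ext4 mount. */
	int fd = open("/mnt/ext4/testfile",
		      O_CREAT | O_WRONLY | O_DIRECT | O_SYNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* O_DIRECT needs an aligned buffer; 4096 covers common block sizes. */
	if (posix_memalign(&buf, 4096, 4096))
		return 1;
	memset(buf, 0xab, 4096);

	/*
	 * Extending O_SYNC DIO: the data *and* the new i_size must be
	 * durable once this returns. A crash or power cut after this
	 * point must not leave the file with the old, smaller size.
	 */
	ret = pwrite(fd, buf, 4096, 0);
	if (ret != 4096) {
		perror("pwrite");
		return 1;
	}

	free(buf);
	return close(fd);
}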