From 2dcf5fde6dffb312a4bfb8ef940cea2d1f402e32 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Mon, 27 Nov 2023 14:33:13 +0800 Subject: ext4: prevent the normalized size from exceeding EXT_MAX_BLOCKS For files with logical blocks close to EXT_MAX_BLOCKS, the file size predicted in ext4_mb_normalize_request() may exceed EXT_MAX_BLOCKS. This can cause some blocks to be preallocated that will not be used. And after [Fixes], the following issue may be triggered: ========================================================= kernel BUG at fs/ext4/mballoc.c:4653! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 1 PID: 2357 Comm: xfs_io 6.7.0-rc2-00195-g0f5cc96c367f Hardware name: linux,dummy-virt (DT) pc : ext4_mb_use_inode_pa+0x148/0x208 lr : ext4_mb_use_inode_pa+0x98/0x208 Call trace: ext4_mb_use_inode_pa+0x148/0x208 ext4_mb_new_inode_pa+0x240/0x4a8 ext4_mb_use_best_found+0x1d4/0x208 ext4_mb_try_best_found+0xc8/0x110 ext4_mb_regular_allocator+0x11c/0xf48 ext4_mb_new_blocks+0x790/0xaa8 ext4_ext_map_blocks+0x7cc/0xd20 ext4_map_blocks+0x170/0x600 ext4_iomap_begin+0x1c0/0x348 ========================================================= Here is a calculation when adjusting ac_b_ex in ext4_mb_new_inode_pa(): ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; The problem is that when orig_goal_end is subtracted from ac_b_ex.fe_len it is still greater than EXT_MAX_BLOCKS, which causes ex.fe_logical to overflow to a very small value, which ultimately triggers a BUG_ON in ext4_mb_new_inode_pa() because pa->pa_free < len. The last logical block of an actual write request does not exceed EXT_MAX_BLOCKS, so in ext4_mb_normalize_request() also avoids normalizing the last logical block to exceed EXT_MAX_BLOCKS to avoid the above issue. The test case in [Link] can reproduce the above issue with 64k block size. Link: https://patchwork.kernel.org/project/fstests/list/?series=804003 Cc: # 6.4 Fixes: 93cdf49f6eca ("ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa()") Signed-off-by: Baokun Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231127063313.3734294-1-libaokun1@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 454d5612641e..d72b5e3c92ec 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4478,6 +4478,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); + /* avoid unnecessary preallocation that may trigger assertions */ + if (start + size > EXT_MAX_BLOCKS) + size = EXT_MAX_BLOCKS - start; + /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; -- cgit v1.2.3 From 85559227211020b270728104c3b89918f7af27ac Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Nov 2023 19:47:39 +0800 Subject: jbd2: correct the printing of write_flags in jbd2_write_superblock() The write_flags print in the trace of jbd2_write_superblock() is not real, so move the modification before the trace. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231129114740.2686201-1-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/journal.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed53188472f9..71b30f6a662d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1798,9 +1798,11 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) return -EIO; } - trace_jbd2_write_superblock(journal, write_flags); if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); + + trace_jbd2_write_superblock(journal, write_flags); + if (buffer_write_io_error(bh)) { /* * Oh, dear. A previous attempt to write the journal -- cgit v1.2.3 From 6a3afb6ac6dfab158ebdd4b87941178f58c8939f Mon Sep 17 00:00:00 2001 From: Zhang Yi Date: Wed, 29 Nov 2023 19:47:40 +0800 Subject: jbd2: increase the journal IO's priority Current jbd2 only add REQ_SYNC for descriptor block, metadata log buffer, commit buffer and superblock buffer, the submitted IO could be throttled by writeback throttle in block layer, that could lead to priority inversion in some cases. The log IO looks like a kind of high priority metadata IO, so it should not be throttled by WBT like QOS policies in block layer, let's add REQ_SYNC | REQ_IDLE to exempt from writeback throttle, and also add REQ_META together indicates it's a metadata IO. Signed-off-by: Zhang Yi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231129114740.2686201-2-yi.zhang@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 9 +++++---- fs/jbd2/journal.c | 20 +++++++++++--------- include/linux/jbd2.h | 3 +++ 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8d6f934c3d95..9bdb377a348f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -119,7 +119,7 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; struct timespec64 now; - blk_opf_t write_flags = REQ_OP_WRITE | REQ_SYNC; + blk_opf_t write_flags = REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS; *cbh = NULL; @@ -395,8 +395,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, - REQ_SYNC); + journal->j_tail, 0); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd2_debug(3, "superblock not updated\n"); @@ -715,6 +714,7 @@ start_journal_io: for (i = 0; i < bufs; i++) { struct buffer_head *bh = wbuf[i]; + /* * Compute checksum. */ @@ -727,7 +727,8 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE | REQ_SYNC, bh); + submit_bh(REQ_OP_WRITE | JBD2_JOURNAL_REQ_FLAGS, + bh); } cond_resched(); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 71b30f6a662d..206cb53ef2b0 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1100,8 +1100,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, - REQ_SYNC | REQ_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1775,8 +1774,7 @@ static int journal_reset(journal_t *journal) */ jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, - journal->j_tail, - REQ_SYNC | REQ_FUA); + journal->j_tail, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1798,6 +1796,11 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) return -EIO; } + /* + * Always set high priority flags to exempt from block layer's + * QOS policies, e.g. writeback throttle. + */ + write_flags |= JBD2_JOURNAL_REQ_FLAGS; if (!(journal->j_flags & JBD2_BARRIER)) write_flags &= ~(REQ_FUA | REQ_PREFLUSH); @@ -2052,7 +2055,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) jbd2_debug(1, "JBD2: updating superblock error (errno %d)\n", errcode); sb->s_errno = cpu_to_be32(errcode); - jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -2173,8 +2176,7 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, - REQ_SYNC | REQ_PREFLUSH | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -2475,7 +2477,7 @@ int jbd2_journal_flush(journal_t *journal, unsigned int flags) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); if (flags) err = __jbd2_journal_erase(journal, flags); @@ -2521,7 +2523,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock_io(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 6dcbb4eb80fb..beb30719ee16 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1374,6 +1374,9 @@ JBD2_FEATURE_INCOMPAT_FUNCS(csum2, CSUM_V2) JBD2_FEATURE_INCOMPAT_FUNCS(csum3, CSUM_V3) JBD2_FEATURE_INCOMPAT_FUNCS(fast_commit, FAST_COMMIT) +/* Journal high priority write IO operation flags */ +#define JBD2_JOURNAL_REQ_FLAGS (REQ_META | REQ_SYNC | REQ_IDLE) + /* * Journal flag definitions */ -- cgit v1.2.3 From 619f75dae2cf117b1d07f27b046b9ffb071c4685 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Nov 2023 10:56:53 +0100 Subject: ext4: fix warning in ext4_dio_write_end_io() The syzbot has reported that it can hit the warning in ext4_dio_write_end_io() because i_size < i_disksize. Indeed the reproducer creates a race between DIO IO completion and truncate expanding the file and thus ext4_dio_write_end_io() sees an inconsistent inode state where i_disksize is already updated but i_size is not updated yet. Since we are careful when setting up DIO write and consider it extending (and thus performing the IO synchronously with i_rwsem held exclusively) whenever it goes past either of i_size or i_disksize, we can use the same test during IO completion without risking entering ext4_handle_inode_extension() without i_rwsem held. This way we make it obvious both i_size and i_disksize are large enough when we report DIO completion without relying on unreliable WARN_ON. Reported-by: Fixes: 91562895f803 ("ext4: properly sync file size update after O_SYNC direct IO") Signed-off-by: Jan Kara Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20231130095653.22679-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0166bb9ca160..6aa15dafc677 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -349,9 +349,10 @@ static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) return; } /* - * If i_disksize got extended due to writeback of delalloc blocks while - * the DIO was running we could fail to cleanup the orphan list in - * ext4_handle_inode_extension(). Do it now. + * If i_disksize got extended either due to writeback of delalloc + * blocks or extending truncate while the DIO was running we could fail + * to cleanup the orphan list in ext4_handle_inode_extension(). Do it + * now. */ if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); @@ -386,10 +387,11 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing - * as much as we intended. + * as much as we intended. Also we can race with truncate or write + * expanding the file so we have to be a bit careful here. */ - WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize)); - if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize)) + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && + pos + size <= i_size_read(inode)) return size; return ext4_handle_inode_extension(inode, pos, size); } -- cgit v1.2.3 From 6c02757c936063f0631b4e43fe156f8c8f1f351f Mon Sep 17 00:00:00 2001 From: Ye Bin Date: Mon, 11 Dec 2023 19:25:44 +0800 Subject: jbd2: fix soft lockup in journal_finish_inode_data_buffers() There's issue when do io test: WARN: soft lockup - CPU#45 stuck for 11s! [jbd2/dm-2-8:4170] CPU: 45 PID: 4170 Comm: jbd2/dm-2-8 Kdump: loaded Tainted: G OE Call trace: dump_backtrace+0x0/0x1a0 show_stack+0x24/0x30 dump_stack+0xb0/0x100 watchdog_timer_fn+0x254/0x3f8 __hrtimer_run_queues+0x11c/0x380 hrtimer_interrupt+0xfc/0x2f8 arch_timer_handler_phys+0x38/0x58 handle_percpu_devid_irq+0x90/0x248 generic_handle_irq+0x3c/0x58 __handle_domain_irq+0x68/0xc0 gic_handle_irq+0x90/0x320 el1_irq+0xcc/0x180 queued_spin_lock_slowpath+0x1d8/0x320 jbd2_journal_commit_transaction+0x10f4/0x1c78 [jbd2] kjournald2+0xec/0x2f0 [jbd2] kthread+0x134/0x138 ret_from_fork+0x10/0x18 Analyzed informations from vmcore as follows: (1) There are about 5k+ jbd2_inode in 'commit_transaction->t_inode_list'; (2) Now is processing the 855th jbd2_inode; (3) JBD2 task has TIF_NEED_RESCHED flag; (4) There's no pags in address_space around the 855th jbd2_inode; (5) There are some process is doing drop caches; (6) Mounted with 'nodioread_nolock' option; (7) 128 CPUs; According to informations from vmcore we know 'journal->j_list_lock' spin lock competition is fierce. So journal_finish_inode_data_buffers() maybe process slowly. Theoretically, there is scheduling point in the filemap_fdatawait_range_keep_errors(). However, if inode's address_space has no pages which taged with PAGECACHE_TAG_WRITEBACK, will not call cond_resched(). So may lead to soft lockup. journal_finish_inode_data_buffers filemap_fdatawait_range_keep_errors __filemap_fdatawait_range while (index <= end) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, PAGECACHE_TAG_WRITEBACK); if (!nr_pages) break; --> If 'nr_pages' is equal zero will break, then will not call cond_resched() for (i = 0; i < nr_pages; i++) wait_on_page_writeback(page); cond_resched(); To solve above issue, add scheduling point in the journal_finish_inode_data_buffers(); Signed-off-by: Ye Bin Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20231211112544.3879780-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o --- fs/jbd2/commit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 9bdb377a348f..5e122586e06e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -270,6 +270,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal, if (!ret) ret = err; } + cond_resched(); spin_lock(&journal->j_list_lock); jinode->i_flags &= ~JI_COMMIT_RUNNING; smp_mb(); -- cgit v1.2.3