From 9462f770eda85b754e82b089d8a0d195fa160837 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:13 -0500 Subject: ext4: Update stale comment about write constraints The comment above do_journal_get_write_access() is very stale. Most of it just does not refer to what the function does today or how jbd2 works. The bit about transaction handling during write(2) is still correct so just update the function names in that part and move the comment to a more appropriate place. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-2-tytso@mit.edu --- fs/ext4/inode.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf0b7dea4900..d27ef74b9d6f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1004,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, return ret; } -/* - * To preserve ordering, it is essential that the hole instantiation and - * the data write be encapsulated in a single transaction. We cannot - * close off a transaction and start a new one between the ext4_get_block() - * and the commit_write(). So doing the jbd2_journal_start at the start of - * prepare_write() is the right place. - * - * Also, this function can nest inside ext4_writepage(). In that case, we - * *know* that ext4_writepage() has generated enough buffer credits to do the - * whole page. So we won't block on the journal in that case, which is good, - * because the caller may be PF_MEMALLOC. - * - * By accident, ext4 can be reentered when a transaction is open via - * quota file writes. If we were to commit the transaction while thus - * reentered, there can be a deadlock - we would be holding a quota - * lock, and the commit would never complete if another thread had a - * transaction open and was blocking on the quota lock - a ranking - * violation. - * - * So what we do is to rely on the fact that jbd2_journal_stop/journal_start - * will _not_ run commit under these circumstances because handle->h_ref - * is elevated. We'll still have enough credits for the tiny quotafile - * write. - */ int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1149,6 +1125,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } #endif +/* + * To preserve ordering, it is essential that the hole instantiation and + * the data write be encapsulated in a single transaction. We cannot + * close off a transaction and start a new one between the ext4_get_block() + * and the ext4_write_end(). So doing the jbd2_journal_start at the start of + * ext4_write_begin() is the right place. + */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) -- cgit v1.2.3 From c8e8e16dbbf0840e6f41575c3d6158bd331218bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:14 -0500 Subject: ext4: Use nr_to_write directly in mpage_prepare_extent_to_map() When looking up extent of pages to map in mpage_prepare_extent_to_map() we count how many pages we still need to find in a copy of wbc->nr_to_write counter. With more complex page handling for data=journal mode, it will be easier to use wbc->nr_to_write directly so that we don't forget to carry over changes back to nr_to_write counter. 
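As an illustration of the resulting check (a minimal userspace sketch, not ext4 code; PAGE_SHIFT and the sample numbers are assumptions for the example), the budget test becomes a comparison of wbc->nr_to_write against the pages already queued in the pending extent, derived from the block count mpd->map.m_len:

#include <stdio.h>

#define PAGE_SHIFT 12			/* assumed 4K pages */

int main(void)
{
	int blkbits = 10;		/* 1K filesystem blocks */
	long nr_to_write = 3;		/* page budget left in the wbc */
	unsigned int m_len = 16;	/* blocks queued but not yet submitted */

	/* convert queued blocks to pages: 16 1K blocks == 4 4K pages */
	long queued_pages = m_len >> (PAGE_SHIFT - blkbits);

	/* stop scanning once the queued pages alone exhaust the budget */
	printf("queued=%ld stop=%d\n", queued_pages,
	       nr_to_write <= queued_pages);
	return 0;
}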
Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-3-tytso@mit.edu --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d27ef74b9d6f..ff913d0cd4b6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2580,7 +2580,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; - long left = mpd->wbc->nr_to_write; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; @@ -2613,7 +2612,9 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ - if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) + if (mpd->wbc->sync_mode == WB_SYNC_NONE && + mpd->wbc->nr_to_write <= + mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ @@ -2682,7 +2683,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) goto out; err = 0; } - left -= folio_nr_pages(folio); } folio_batch_release(&fbatch); cond_resched(); -- cgit v1.2.3 From 3f5d30636d2a188ee3cd22c6fef1ace5304a07bf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:15 -0500 Subject: ext4: Mark page for delayed dirtying only if it is pinned In data=journal mode, page should be dirtied only when it has buffers for checkpoint or it is writeably mapped. In the first case, we don't need to do anything special. In the second case, page was already added to the journal by ext4_page_mkwrite() and since transaction commit writeprotects mapped pages again, page should be writeable (and thus dirtied) only while it is part of the running transaction. So nothing needs to be done either. The only special case is when someone pins the page and uses this pin for modifying page data. So recognize this special case and only then mark the page as having data that needs adding to the journal. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-4-tytso@mit.edu --- fs/ext4/inode.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index ff913d0cd4b6..118eb674038a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3669,24 +3669,26 @@ const struct iomap_ops ext4_iomap_report_ops = { }; /* - * Whenever the folio is being dirtied, corresponding buffers should already - * be attached to the transaction (we take care of this in ext4_page_mkwrite() - * and ext4_write_begin()). However we cannot move buffers to dirty transaction - * lists here because ->dirty_folio is called under VFS locks and the folio - * is not necessarily locked. - * - * We cannot just dirty the folio and leave attached buffers clean, because the - * buffers' dirty state is "definitive". We cannot just set the buffers dirty - * or jbddirty because all the journalling code will explode. - * - * So what we do is to mark the folio "pending dirty" and next time writepage - * is called, propagate that into the buffers appropriately. + * For data=journal mode, folio should be marked dirty only when it was + * writeably mapped. When that happens, it was already attached to the + * transaction and marked as jbddirty (we take care of this in + * ext4_page_mkwrite()). 
On transaction commit, we writeprotect page mappings + * so we should have nothing to do here, except for the case when someone + * had the page pinned and dirtied the page through this pin (e.g. by doing + * direct IO to it). In that case we'd need to attach buffers here to the + * transaction but we cannot due to lock ordering. We cannot just dirty the + * folio and leave attached buffers clean, because the buffers' dirty state is + * "definitive". We cannot just set the buffers dirty or jbddirty because all + * the journalling code will explode. So what we do is to mark the folio + * "pending dirty" and next time ext4_writepages() is called, attach buffers + * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); - folio_set_checked(folio); + if (folio_maybe_dma_pinned(folio)) + folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } -- cgit v1.2.3 From f1496362e9d7b37fe6b8983086c1548a601b5594 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:16 -0500 Subject: ext4: Don't unlock page in ext4_bio_write_page() Do not unlock the written page in ext4_bio_write_page(). Instead leave the page locked and unlock it in the callers. We'll need to keep the page locked for data=journal writeback for a bit longer. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-5-tytso@mit.edu --- fs/ext4/inode.c | 2 ++ fs/ext4/page-io.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 118eb674038a..fcaa2a7a27a5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2076,6 +2076,7 @@ static int ext4_writepage(struct page *page, return -ENOMEM; } ret = ext4_bio_write_page(&io_submit, page, len); + unlock_page(page); ext4_io_submit(&io_submit); /* Drop io_end reference we got from init */ ext4_put_io_end_defer(io_submit.io_end); @@ -2110,6 +2111,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len); + unlock_page(page); if (!err) mpd->wbc->nr_to_write--; mpd->first_page++; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 1e4db96a04e6..8703fd732abb 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -502,7 +502,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Nothing to submit? Just unlock the page... */ if (!nr_to_submit) - goto unlock; + return 0; bh = head = page_buffers(page); @@ -550,7 +550,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } bh = bh->b_this_page; } while (bh != head); - goto unlock; + + return ret; } } @@ -565,7 +566,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, continue; io_submit_add_bh(io, inode, page, bounce_page, bh); } while ((bh = bh->b_this_page) != head); -unlock: - unlock_page(page); - return ret; + + return 0; } -- cgit v1.2.3 From eaf2ca10ca4ba450f8e514cb8bfc9149660b57f6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:17 -0500 Subject: ext4: Move page unlocking out of mpage_submit_page() Move page unlocking during page writeback out of mpage_submit_page() into the callers. This will allow writeback in data=journal mode to keep the page locked for a bit longer. 
Since page unlocking is tightly connected to the increment of mpd->first_page (as that determines cleanup of locked but unwritten pages), move page unlocking as well as mpd->first_page handling into a helper function. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-6-tytso@mit.edu --- fs/ext4/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fcaa2a7a27a5..85951da08d60 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2083,6 +2083,12 @@ static int ext4_writepage(struct page *page, return ret; } +static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) +{ + mpd->first_page++; + unlock_page(page); +} + static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; @@ -2111,10 +2117,8 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) else len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len); - unlock_page(page); if (!err) mpd->wbc->nr_to_write--; - mpd->first_page++; return err; } @@ -2226,6 +2230,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_page(mpd, head->b_page); + mpage_page_done(mpd, head->b_page); if (err < 0) return err; } @@ -2357,6 +2362,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); + mpage_page_done(mpd, page); if (err < 0) goto out; } @@ -2666,14 +2672,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { + if (ext4_page_nomap_can_writeout(&folio->page)) err = mpage_submit_page(mpd, &folio->page); - if (err < 0) - goto out; - } else { - folio_unlock(folio); - mpd->first_page += folio_nr_pages(folio); - } + mpage_page_done(mpd, &folio->page); + if (err < 0) + goto out; } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From d8be7607de039e5a477b0a5c63959d81ac052c3b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:18 -0500 Subject: ext4: Move mpage_page_done() calls after error handling In case mpage_submit_page() returns an error, it doesn't really matter whether we call mpage_page_done() and then return the error or whether we return directly, because in that case page cleanup will be done by mpage_release_unused_pages() instead. Logically, it makes more sense to leave the cleanup to mpage_release_unused_pages() because we didn't succeed in writing the page. So move mpage_page_done() calls after the error handling. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-7-tytso@mit.edu --- fs/ext4/inode.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85951da08d60..472f6b914f48 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2230,9 +2230,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* So far everything mapped? Submit the page for IO.
*/ if (mpd->map.m_len == 0) { err = mpage_submit_page(mpd, head->b_page); - mpage_page_done(mpd, head->b_page); if (err < 0) return err; + mpage_page_done(mpd, head->b_page); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2362,9 +2362,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); - mpage_page_done(mpd, page); if (err < 0) goto out; + mpage_page_done(mpd, page); } folio_batch_release(&fbatch); } @@ -2672,11 +2672,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * modify metadata is simple. Just submit the page. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) + if (ext4_page_nomap_can_writeout(&folio->page)) { err = mpage_submit_page(mpd, &folio->page); + if (err < 0) + goto out; + } mpage_page_done(mpd, &folio->page); - if (err < 0) - goto out; } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From 3f079114bf522f27f3680238b6429f3dd45535b6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 28 Feb 2023 00:13:19 -0500 Subject: ext4: Convert data=journal writeback to use ext4_writepages() Add support for writeback of journalled data directly into ext4_writepages() instead of offloading it to write_cache_pages(). This actually significantly simplifies the code and reduces code duplication. For checkpointing of committed data we can use ext4_writepages() right away the same way as writeback of ordered data uses it on transaction commit. For journalling of dirty mapped pages, we need to add a special case to mpage_prepare_extent_to_map() to add all page buffers to the journal. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230228051319.4085470-8-tytso@mit.edu --- fs/ext4/inode.c | 341 ++++++++++++-------------------------------- include/trace/events/ext4.h | 7 - 2 files changed, 91 insertions(+), 257 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 472f6b914f48..652efb2221bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int __ext4_journalled_writepage(struct page *page, unsigned int len); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -1632,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, - struct buffer_head *bh) -{ - return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); -} - /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block @@ -1870,219 +1863,6 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int __ext4_journalled_writepage(struct page *page, - unsigned int len) -{ - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; - handle_t *handle = NULL; - int ret = 0, err = 0; - int inline_data = ext4_has_inline_data(inode); - struct buffer_head *inode_bh = NULL; - loff_t size; - - ClearPageChecked(page); - - if (inline_data) { - BUG_ON(page->index != 0); - BUG_ON(len > ext4_get_max_inline_size(inode)); - inode_bh = ext4_journalled_write_inline_data(inode, len, page); - if (inode_bh == NULL) - goto out; - } - /* - * We need to release the page lock before we start the - * journal, so grab a reference so
the page won't disappear - * out from under us. - */ - get_page(page); - unlock_page(page); - - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_writepage_trans_blocks(inode)); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - put_page(page); - goto out_no_pagelock; - } - BUG_ON(!ext4_handle_valid(handle)); - - lock_page(page); - put_page(page); - size = i_size_read(inode); - if (page->mapping != mapping || page_offset(page) > size) { - /* The page got truncated from under us */ - ext4_journal_stop(handle); - ret = 0; - goto out; - } - - if (inline_data) { - ret = ext4_mark_inode_dirty(handle, inode); - } else { - struct buffer_head *page_bufs = page_buffers(page); - - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, do_journal_get_write_access); - - err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, - NULL, write_end_fn); - } - if (ret == 0) - ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); - if (ret == 0) - ret = err; - EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - err = ext4_journal_stop(handle); - if (!ret) - ret = err; - - ext4_set_inode_state(inode, EXT4_STATE_JDATA); -out: - unlock_page(page); -out_no_pagelock: - brelse(inode_bh); - return ret; -} - -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), no one guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * This function can get called via... - * - ext4_writepages after taking page lock (have journal handle) - * - journal_submit_inode_data_buffers (no journal handle) - * - shrink_page_list via the kswapd/direct reclaim (no journal handle) - * - grab_page_cache when doing write_begin (have journal handle) - * - * We don't do any block allocation in this function. If we have page with - * multiple blocks we need to write those buffer_heads that are mapped. This - * is important for mmaped based write. So if we do with blocksize 1K - * truncate(f, 1024); - * a = mmap(f, 0, 4096); - * a[0] = 'a'; - * truncate(f, 4096); - * we have in the page first buffer_head mapped via page_mkwrite call back - * but other buffer_heads would be unmapped but dirty (dirty done via the - * do_wp_page). So writepage should write the first block. If we modify - * the mmap area beyond 1024 we will again get a page_fault and the - * page_mkwrite callback will do the block allocation and mark the - * buffer_heads mapped. - * - * We redirty the page if we have any buffer_heads that is either delay or - * unwritten in the page. - * - * We can get recursively called as show below. - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * But since we don't do any block allocation we should not deadlock. - * Page also have the dirty flag cleared so we don't get recurive page_lock. 
- */ -static int ext4_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); - int ret = 0; - loff_t size; - unsigned int len; - struct buffer_head *page_bufs = NULL; - struct inode *inode = page->mapping->host; - struct ext4_io_submit io_submit; - - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { - folio_invalidate(folio, 0, folio_size(folio)); - folio_unlock(folio); - return -EIO; - } - - trace_ext4_writepage(page); - size = i_size_read(inode); - if (page->index == size >> PAGE_SHIFT && - !ext4_verity_in_progress(inode)) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - - /* Should never happen but for bugs in other kernel subsystems */ - if (!page_has_buffers(page)) { - ext4_warning_inode(inode, - "page %lu does not have buffers attached", page->index); - ClearPageDirty(page); - unlock_page(page); - return 0; - } - - page_bufs = page_buffers(page); - /* - * We cannot do block allocation or other extent handling in this - * function. If there are buffers needing that, we have to redirty - * the page. But we may reach here when we do a journal commit via - * journal_submit_inode_data_buffers() and in that case we must write - * allocated buffers to achieve data=ordered mode guarantees. - * - * Also, if there is only one buffer per page (the fs block - * size == the page size), if one buffer needs block - * allocation or needs to modify the extent tree to clear the - * unwritten flag, we know that the page can't be written at - * all, so we might as well refuse the write immediately. - * Unfortunately if the block size != page size, we can't as - * easily detect this case using ext4_walk_page_buffers(), but - * for the extremely common case, this is an optimization that - * skips a useless round trip through ext4_bio_write_page(). - */ - if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, - ext4_bh_delay_or_unwritten)) { - redirty_page_for_writepage(wbc, page); - if ((current->flags & PF_MEMALLOC) || - (inode->i_sb->s_blocksize == PAGE_SIZE)) { - /* - * For memory cleaning there's no point in writing only - * some buffers. So just bail out. Warn if we came here - * from direct reclaim. - */ - WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) - == PF_MEMALLOC); - unlock_page(page); - return 0; - } - } - - if (PageChecked(page) && ext4_should_journal_data(inode)) - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. 
*/ - return __ext4_journalled_writepage(page, len); - - ext4_io_submit_init(&io_submit, wbc); - io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_submit.io_end) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return -ENOMEM; - } - ret = ext4_bio_write_page(&io_submit, page, len); - unlock_page(page); - ext4_io_submit(&io_submit); - /* Drop io_end reference we got from init */ - ext4_put_io_end_defer(io_submit.io_end); - return ret; -} - static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) { mpd->first_page++; @@ -2563,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) return false; } +static int ext4_journal_page_buffers(handle_t *handle, struct page *page, + int len) +{ + struct buffer_head *page_bufs = page_buffers(page); + struct inode *inode = page->mapping->host; + int ret, err; + + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); + if (ret == 0) + ret = err; + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); + if (ret == 0) + ret = err; + EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; + + ext4_set_inode_state(inode, EXT4_STATE_JDATA); + + return ret; +} + +static int mpage_journal_page_buffers(handle_t *handle, + struct mpage_da_data *mpd, + struct page *page) +{ + struct inode *inode = mpd->inode; + loff_t size = i_size_read(inode); + int len; + + ClearPageChecked(page); + clear_page_dirty_for_io(page); + mpd->wbc->nr_to_write--; + + if (page->index == size >> PAGE_SHIFT && + !ext4_verity_in_progress(inode)) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + + return ext4_journal_page_buffers(handle, page, len); +} + /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages @@ -2595,11 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; + handle_t *handle = NULL; + int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; + + if (ext4_should_journal_data(mpd->inode)) { + handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, + bpp); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } folio_batch_init(&fbatch); mpd->map.m_len = 0; mpd->next_page = index; @@ -2629,6 +2462,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; + if (handle) { + err = ext4_journal_ensure_credits(handle, bpp, + 0); + if (err < 0) + goto out; + } + folio_lock(folio); /* * If the page is no longer dirty, or its mapping no @@ -2668,8 +2508,15 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->first_page = folio->index; mpd->next_page = folio->index + folio_nr_pages(folio); /* - * Writeout for transaction commit where we cannot - * modify metadata is simple. Just submit the page. + * Writeout when we cannot modify metadata is simple. + * Just submit the page. For data=journal mode we + * first handle writeout of the page for checkpoint and + * only after that handle delayed page dirtying. This + * is crucial so that forcing a transaction commit and + * then calling filemap_write_and_wait() guarantees + * current state of data is in its final location.
Such + * sequence is used for example by insert/collapse + * range operations before discarding the page cache. */ if (!mpd->can_map) { if (ext4_page_nomap_can_writeout(&folio->page)) { @@ -2677,6 +2524,13 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (err < 0) goto out; } + /* Pending dirtying of journalled data? */ + if (PageChecked(&folio->page)) { + err = mpage_journal_page_buffers(handle, + mpd, &folio->page); + if (err < 0) + goto out; + } mpage_page_done(mpd, &folio->page); } else { /* Add all dirty buffers to mpd */ @@ -2694,18 +2548,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) cond_resched(); } mpd->scanned_until_end = 1; + if (handle) + ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); + if (handle) + ext4_journal_stop(handle); return err; } -static int ext4_writepage_cb(struct folio *folio, struct writeback_control *wbc, - void *data) -{ - return ext4_writepage(&folio->page, wbc); -} - static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; @@ -2731,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; - if (ext4_should_journal_data(inode)) { - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); - blk_finish_plug(&plug); - goto out_writepages; - } - /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that @@ -2772,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) ext4_journal_stop(handle); } + /* + * data=journal mode does not do delalloc so we just need to writeout / + * journal already mapped buffers + */ + if (ext4_should_journal_data(inode)) + mpd->can_map = 0; + if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in @@ -3148,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, * i_disksize since writeback will push i_disksize upto i_size * eventually. If the end of the current write is > i_size and * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as neither - * ext4_writepage() nor certain ext4_writepages() paths not - * allocating blocks update i_disksize. + * check), we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks update i_disksize. * * Note that we defer inode dirtying to generic_write_end() / * ext4_da_write_inline_data_end(). @@ -5376,7 +5227,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then - * confuse e.g. concurrent ext4_writepage() seeing dirty folio without + * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. 
@@ -6314,18 +6165,8 @@ retry_alloc: err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - do_journal_get_write_access)) - goto out_error; - if (ext4_walk_page_buffers(handle, inode, - page_buffers(page), 0, len, NULL, - write_end_fn)) - goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, - page_offset(page), len)) + if (ext4_journal_page_buffers(handle, page, len)) goto out_error; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { unlock_page(page); } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 77b426ae0064..ebccf6a6aa1b 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, (unsigned long) __entry->index) ); -DEFINE_EVENT(ext4__page_op, ext4_writepage, - - TP_PROTO(struct page *page), - - TP_ARGS(page) -); - DEFINE_EVENT(ext4__page_op, ext4_readpage, TP_PROTO(struct page *page), -- cgit v1.2.3 From e6c28a26b799c7640b77daff3e4a67808c74381c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2023 15:25:28 +0100 Subject: ext4: Fix warnings when freezing filesystem with journaled data Test generic/390 in data=journal mode often triggers a warning that ext4_do_writepages() tries to start a transaction on frozen filesystem. This happens because although all dirty data is properly written, jbd2 checkpointing code writes data through submit_bh() and as a result only buffer dirty bits are cleared but page dirty bits stay set. Later when the filesystem is frozen, writeback code comes, tries to write supposedly dirty pages and the warning triggers. Fix the problem by calling sync_filesystem() once more after flushing the whole journal to clear stray page dirty bits. [ Applied fixup patches to address crashes when running data=journal tests; see links for more details -- TYT ] Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230308142528.12384-1-jack@suse.cz Reported-by: Eric Biggers Link: https://lore.kernel.org/all/20230319183617.GA896@sol.localdomain Link: https://lore.kernel.org/r/20230323145404.21381-1-jack@suse.cz Link: https://lore.kernel.org/r/20230323145404.21381-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 19 ++++++++++++++++--- fs/ext4/super.c | 11 +++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 652efb2221bf..6445b8017a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2410,6 +2410,7 @@ static int mpage_journal_page_buffers(handle_t *handle, static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; + struct super_block *sb = mpd->inode->i_sb; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; @@ -2427,15 +2428,23 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - if (ext4_should_journal_data(mpd->inode)) { + mpd->map.m_len = 0; + mpd->next_page = index; + /* + * Start a transaction for writeback of journalled data. We don't start + * start the transaction if the filesystem is frozen. In that case we + * should not have any dirty data to write anymore but possibly there + * are stray page dirty bits left by the checkpointing code so this + * loop clears them. 
+ */ + if (ext4_should_journal_data(mpd->inode) && + sb->s_writers.frozen < SB_FREEZE_FS) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); - mpd->map.m_len = 0; - mpd->next_page = index; while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); @@ -2520,12 +2529,16 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) */ if (!mpd->can_map) { if (ext4_page_nomap_can_writeout(&folio->page)) { + WARN_ON_ONCE(sb->s_writers.frozen == + SB_FREEZE_COMPLETE); err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; } /* Pending dirtying of journalled data? */ if (PageChecked(&folio->page)) { + WARN_ON_ONCE(sb->s_writers.frozen >= + SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, mpd, &folio->page); if (err < 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f43e526112ae..f226f8ab469b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6293,6 +6293,17 @@ static int ext4_freeze(struct super_block *sb) if (error < 0) goto out; + /* + * Do another sync. We really should not have any dirty data + * anymore but our checkpointing code does not clear page dirty + * bits due to locking constraints so writeback still can get + * started for inodes with journalled data which triggers + * annoying warnings. + */ + error = sync_filesystem(sb); + if (error < 0) + goto out; + /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) -- cgit v1.2.3 From 98ccceee3e0637a37e20c1c12a08173663db77e7 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 15 Mar 2023 14:34:18 -0400 Subject: ext4: fix comment: "start start" -> "start" in mpage_prepare_extent_to_map() Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6445b8017a8e..dbcc8b48c7ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2432,7 +2432,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = index; /* * Start a transaction for writeback of journalled data. We don't start - * start the transaction if the filesystem is frozen. In that case we + * the transaction if the filesystem is frozen. In that case we * should not have any dirty data to write anymore but possibly there * are stray page dirty bits left by the checkpointing code so this * loop clears them. -- cgit v1.2.3 From b5aa06bfe9ad13c75d425839ee2aca045e3c688c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:13 +0800 Subject: ext4: properly handle error of ext4_init_block_bitmap in ext4_read_block_bitmap_nowait We mark the bitmap buffer_head as successfully initialized even if an error occurs in ext4_init_block_bitmap. Although we will return an error, the next ext4_read_block_bitmap_nowait will hand back an invalid bitmap buffer_head which is marked buffer_verified but was not actually initialized successfully by the previous call. Fix this by marking the buffer_head successfully initialized only if ext4_init_block_bitmap succeeds.
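The ordering this establishes can be sketched in plain C (a hedged illustration; init_bitmap() is a hypothetical stand-in for ext4_init_block_bitmap(), not the ext4 code itself): the flags that advertise a usable bitmap are set only on the success path, so a failed initialization leaves nothing for a later reader to trust.

#include <stdbool.h>
#include <stdio.h>

struct bh {
	bool uptodate;
	bool verified;
};

/* hypothetical stand-in for ext4_init_block_bitmap() */
static int init_bitmap(struct bh *bh, bool fail)
{
	return fail ? -1 : 0;
}

static int read_bitmap(struct bh *bh, bool fail)
{
	int err = init_bitmap(bh, fail);

	/* failure: leave flags clear so the next reader
	 * re-initializes instead of trusting this bh */
	if (err)
		return err;

	/* success: only now advertise the buffer as usable */
	bh->uptodate = true;
	bh->verified = true;
	return 0;
}

int main(void)
{
	struct bh bh = { false, false };

	read_bitmap(&bh, true);
	printf("after failed init: verified=%d\n", bh.verified);
	return 0;
}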
Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8ff4b9192a9f..10cab743e313 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -474,16 +474,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); - set_bitmap_uptodate(bh); - set_buffer_uptodate(bh); - set_buffer_verified(bh); - ext4_unlock_group(sb, block_group); - unlock_buffer(bh); if (err) { + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); goto verify; } ext4_unlock_group(sb, block_group); -- cgit v1.2.3 From 3d61ef10f521b5c6906a5ba71573d5503aee2aa5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:14 +0800 Subject: ext4: correct validation check of inode table in ext4_valid_block_bitmap 1.Last valid cluster of inode table is EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1). We should make sure last valid cluster is < max_bit, i.e., EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) is < max_bit rather than EXT4_B2C(sbi, offset + sbi->s_itb_per_group) is < max_bit. 2.Bit search length should be last valid cluster plus 1, i.e., EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1 rather than EXT4_B2C(sbi, offset + sbi->s_itb_per_group). Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 10cab743e313..22be5cd70505 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -350,13 +350,13 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || - EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, - EXT4_B2C(sbi, offset + sbi->s_itb_per_group), + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1, EXT4_B2C(sbi, offset)); if (next_zero_bit < - EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) + EXT4_B2C(sbi, offset + sbi->s_itb_per_group - 1) + 1) /* bad bitmap for inode tables */ return blk; return 0; -- cgit v1.2.3 From a38627f14356f505f621b31197fd872b99a10563 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:15 +0800 Subject: ext4: call ext4_bg_num_gdb_[no]meta directly in ext4_num_base_meta_clusters ext4_num_base_meta_clusters is already aware of meta_bg feature and test if block_group is inside real meta block groups before calling ext4_bg_num_gdb. Then ext4_bg_num_gdb will check if block group is inside a real meta block groups again to decide either ext4_bg_num_gdb_meta or ext4_bg_num_gdb_nometa is needed. 
Call ext4_bg_num_gdb_meta or ext4_bg_num_gdb_nometa directly after we check if block_group is inside a meta block groups in ext4_num_base_meta_clusters to remove redundant check of meta block groups in ext4_bg_num_gdb. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 22be5cd70505..9b8a32b90ddc 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -889,11 +889,11 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_nometa(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ - num += ext4_bg_num_gdb(sb, block_group); + num += ext4_bg_num_gdb_meta(sb, block_group); } return EXT4_NUM_B2C(sbi, num); } -- cgit v1.2.3 From ad3f09be6cfe332be8ff46c78e6ec0f8839107aa Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:16 +0800 Subject: ext4: remove unnecessary check in ext4_bg_num_gdb_nometa We only call ext4_bg_num_gdb_nometa if there is no meta_bg feature or group does not reach first_meta_bg, so group must not reside in meta group. Then group has a global gdt if superblock exists. Just remove confusing branch that meta_bg feature exists. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-5-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9b8a32b90ddc..08f1692f7d2f 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -844,10 +844,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, if (!ext4_bg_has_super(sb, group)) return 0; - if (ext4_has_feature_meta_bg(sb)) - return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); - else - return EXT4_SB(sb)->s_gdb_count; + return EXT4_SB(sb)->s_gdb_count; } /** -- cgit v1.2.3 From f567ea7843562b7d7dfe1e6cfea455a1e9623208 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:17 +0800 Subject: ext4: remove stale comment in ext4_init_block_bitmap Commit bdfb6ff4a255d ("ext4: mark group corrupt on group descriptor checksum") added flag to indicate corruption of group instead of marking all blocks used. Just remove stale comment. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-6-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 08f1692f7d2f..4de06d68e9e7 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -187,8 +187,6 @@ static int ext4_init_block_bitmap(struct super_block *sb, ASSERT(buffer_locked(bh)); - /* If checksum is bad mark all blocks used to prevent allocation - * essentially implementing a per-group read-only flag. 
*/ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | -- cgit v1.2.3 From cefa74d004fa48bc073ad66b7a7ad00dc8d7bfa6 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:18 +0800 Subject: ext4: stop trying to verify just initialized bitmap in ext4_read_block_bitmap_nowait For the case where we initialize a bitmap bh, we set the bitmap bh verified. We can return immediately instead of jumping to the verify label, which removes the unnecessary work of trying to verify the bitmap bh. Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-7-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 4de06d68e9e7..dab46274d591 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -484,7 +484,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); - goto verify; + return bh; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { -- cgit v1.2.3 From 68e294dcccf965c93d65c4660547794728a4cff0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 21 Feb 2023 19:59:19 +0800 Subject: ext4: improve inode table blocks counting in ext4_num_overhead_clusters As inode table blocks are contiguous, inode table blocks inside the block_group can be represented as the range [itbl_cluster_start, itbl_cluster_last]. Then we can simply account inode table clusters and check cluster overlap with [itbl_cluster_start, itbl_cluster_last] instead of traversing each block of the inode table. By the way, this patch fixes a code style problem in the comment for ext4_num_overhead_clusters. [ Merged fix-up patch which fixed a potential access to an uninitialized stack variable. --TYT ] Signed-off-by: Kemeng Shi Link: https://lore.kernel.org/r/20230221115919.1918161-8-shikemeng@huaweicloud.com Link: https://lore.kernel.org/r/202303171446.eLEhZzAu-lkp@intel.com/ Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 90 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 47 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index dab46274d591..535a8fb67ae6 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -80,32 +80,56 @@ static inline int ext4_block_in_group(struct super_block *sb, return (actual_group == block_group) ? 1 : 0; } -/* Return the number of clusters used for file system metadata; this +/* + * Return the number of clusters used for file system metadata; this
*/ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned num_clusters; - int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; + unsigned base_clusters, num_clusters; + int block_cluster = -1, inode_cluster; + int itbl_cluster_start = -1, itbl_cluster_end = -1; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); - ext4_fsblk_t itbl_blk; + ext4_fsblk_t end = start + EXT4_BLOCKS_PER_GROUP(sb) - 1; + ext4_fsblk_t itbl_blk_start, itbl_blk_end; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ - num_clusters = ext4_num_base_meta_clusters(sb, block_group); + base_clusters = ext4_num_base_meta_clusters(sb, block_group); + num_clusters = base_clusters; /* - * For the allocation bitmaps and inode table, we first need - * to check to see if the block is in the block group. If it - * is, then check to see if the cluster is already accounted - * for in the clusters used for the base metadata cluster, or - * if we can increment the base metadata cluster to include - * that block. Otherwise, we will have to track the cluster - * used for the allocation bitmap or inode table explicitly. + * Account and record inode table clusters if any cluster + * is in the block group, or inode table cluster range is + * [-1, -1] and won't overlap with block/inode bitmap cluster + * accounted below. + */ + itbl_blk_start = ext4_inode_table(sb, gdp); + itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; + if (itbl_blk_start <= end && itbl_blk_end >= start) { + itbl_blk_start = itbl_blk_start >= start ? + itbl_blk_start : start; + itbl_blk_end = itbl_blk_end <= end ? + itbl_blk_end : end; + + itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); + itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); + + num_clusters += itbl_cluster_end - itbl_cluster_start + 1; + /* check if border cluster is overlapped */ + if (itbl_cluster_start == base_clusters - 1) + num_clusters--; + } + + /* + * For the allocation bitmaps, we first need to check to see + * if the block is in the block group. If it is, then check + * to see if the cluster is already accounted for in the clusters + * used for the base metadata cluster and inode tables cluster. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. 
@@ -113,46 +137,26 @@ static unsigned ext4_num_overhead_clusters(struct super_block *sb, if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); - if (block_cluster < num_clusters) - block_cluster = -1; - else if (block_cluster == num_clusters) { + if (block_cluster >= base_clusters && + (block_cluster < itbl_cluster_start || + block_cluster > itbl_cluster_end)) num_clusters++; - block_cluster = -1; - } } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); - if (inode_cluster < num_clusters) - inode_cluster = -1; - else if (inode_cluster == num_clusters) { - num_clusters++; - inode_cluster = -1; - } - } - - itbl_blk = ext4_inode_table(sb, gdp); - for (i = 0; i < sbi->s_itb_per_group; i++) { - if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { - c = EXT4_B2C(sbi, itbl_blk + i - start); - if ((c < num_clusters) || (c == inode_cluster) || - (c == block_cluster) || (c == itbl_cluster)) - continue; - if (c == num_clusters) { - num_clusters++; - continue; - } + /* + * Additional check if inode bitmap is in just accounted + * block_cluster + */ + if (inode_cluster != block_cluster && + inode_cluster >= base_clusters && + (inode_cluster < itbl_cluster_start || + inode_cluster > itbl_cluster_end)) num_clusters++; - itbl_cluster = c; - } } - if (block_cluster != -1) - num_clusters++; - if (inode_cluster != -1) - num_clusters++; - return num_clusters; } -- cgit v1.2.3 From b83acc77718644b2c19832226ca284ce01efc550 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 22 Feb 2023 04:30:24 +0800 Subject: ext4: remove unused group parameter in ext4_inode_bitmap_csum_verify Remove unused group parameter in ext4_inode_bitmap_csum_verify. 
Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230221203027.2359920-2-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 2 +- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index f63e028c638c..3bb28fad624f 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -16,7 +16,7 @@ unsigned int ext4_count_free(char *bitmap, unsigned int numchars) return numchars * BITS_PER_BYTE - memweight(bitmap, numchars); } -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 08b29c289da4..0ed009bb0921 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2679,7 +2679,7 @@ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); -int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 157663031f8c..908228064f5c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -98,7 +98,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, if (buffer_verified(bh)) goto verified; blk = ext4_inode_bitmap(sb, desc); - if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh, + if (!ext4_inode_bitmap_csum_verify(sb, desc, bh, EXT4_INODES_PER_GROUP(sb) / 8) || ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { ext4_unlock_group(sb, block_group); -- cgit v1.2.3 From 4fd873c8175dd695aa3de2473951c51715b64d8c Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 22 Feb 2023 04:30:25 +0800 Subject: ext4: remove unused group parameter in ext4_inode_bitmap_csum_set Remove unused group parameter in ext4_inode_bitmap_csum_set. 
Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230221203027.2359920-3-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 2 +- fs/ext4/ext4.h | 2 +- fs/ext4/ialloc.c | 6 +++--- fs/ext4/resize.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 3bb28fad624f..0186b894f5b3 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -38,7 +38,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, return provided == calculated; } -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0ed009bb0921..55f115e97145 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2676,7 +2676,7 @@ struct mmpd_data { /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); -void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, +void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh, int sz); int ext4_inode_bitmap_csum_verify(struct super_block *sb, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 908228064f5c..f4223749ac2e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -327,7 +327,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (percpu_counter_initialized(&sbi->s_dirs_counter)) percpu_counter_dec(&sbi->s_dirs_counter); } - ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, block_group, gdp); ext4_unlock_group(sb, block_group); @@ -852,7 +852,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino) ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1); if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } @@ -1222,7 +1222,7 @@ got: } } if (ext4_has_group_desc_csum(sb)) { - ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh, + ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); ext4_group_desc_csum_set(sb, group, gdp); } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6b91443d6bf3..607a1af00665 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1318,7 +1318,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, bh = ext4_get_bitmap(sb, group_data->inode_bitmap); if (!bh) return -EIO; - ext4_inode_bitmap_csum_set(sb, group, gdp, bh, + ext4_inode_bitmap_csum_set(sb, gdp, bh, EXT4_INODES_PER_GROUP(sb) / 8); brelse(bh); -- cgit v1.2.3 From 82483dfe17d036146f708809b33845c8aaf029b5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 22 Feb 2023 04:30:26 +0800 Subject: ext4: remove unused group parameter in ext4_block_bitmap_csum_verify Remove unused group parameter in ext4_block_bitmap_csum_verify. 
Signed-off-by: Kemeng Shi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230221203027.2359920-4-shikemeng@huaweicloud.com Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 3 +-- fs/ext4/bitmap.c | 2 +- fs/ext4/ext4.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 535a8fb67ae6..094269488183 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -385,8 +385,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; - if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, - desc, bh) || + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 0186b894f5b3..3b83d979a650 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -54,7 +54,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); } -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 55f115e97145..1b97a59f913b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2685,7 +2685,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *gdp, struct buffer_head *bh); -int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, +int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); -- cgit v1.2.3 From 1df9bde48fc6e8efe520f4d89ff35769b2c56b8b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 22 Feb 2023 04:30:27 +0800 Subject: ext4: remove unused group parameter in ext4_block_bitmap_csum_set Remove unused group parameter in ext4_block_bitmap_csum_set. After this, group parameter in ext4_set_bitmap_checksums is also not used, just remove it too. 
Signed-off-by: Kemeng Shi
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/20230221203027.2359920-5-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/bitmap.c  | 2 +-
 fs/ext4/ext4.h    | 2 +-
 fs/ext4/ialloc.c  | 6 ++----
 fs/ext4/mballoc.c | 10 +++++-----
 fs/ext4/resize.c  | 5 ++---
 5 files changed, 11 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index 3b83d979a650..87c1c8ae9298 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -80,7 +80,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
 	return 0;
 }
 
-void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+void ext4_block_bitmap_csum_set(struct super_block *sb,
 				struct ext4_group_desc *gdp,
 				struct buffer_head *bh)
 {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 1b97a59f913b..9b2cfc32cf78 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2682,7 +2682,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
 int ext4_inode_bitmap_csum_verify(struct super_block *sb,
 				  struct ext4_group_desc *gdp,
 				  struct buffer_head *bh, int sz);
-void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+void ext4_block_bitmap_csum_set(struct super_block *sb,
 				struct ext4_group_desc *gdp,
 				struct buffer_head *bh);
 int ext4_block_bitmap_csum_verify(struct super_block *sb,
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f4223749ac2e..787ab89c2c26 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -813,8 +813,7 @@ int ext4_mark_inode_used(struct super_block *sb, int ino)
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		ext4_free_group_clusters_set(sb, gdp,
 			ext4_free_clusters_after_init(sb, group, gdp));
-		ext4_block_bitmap_csum_set(sb, group, gdp,
-					   block_bitmap_bh);
+		ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
 		ext4_group_desc_csum_set(sb, group, gdp);
 	}
 	ext4_unlock_group(sb, group);
@@ -1165,8 +1164,7 @@ got:
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		ext4_free_group_clusters_set(sb, gdp,
 			ext4_free_clusters_after_init(sb, group, gdp));
-		ext4_block_bitmap_csum_set(sb, group, gdp,
-					   block_bitmap_bh);
+		ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
 		ext4_group_desc_csum_set(sb, group, gdp);
 	}
 	ext4_unlock_group(sb, group);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5b2ae37a8b80..0d69cd8c5fc3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3822,7 +3822,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	}
 	len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
 	ext4_free_group_clusters_set(sb, gdp, len);
-	ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
+	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
 	ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
 
 	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
@@ -3929,7 +3929,7 @@ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
 		clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
 
 		ext4_free_group_clusters_set(sb, gdp, clen);
-		ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+		ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
 		ext4_group_desc_csum_set(sb, group, gdp);
 
 		ext4_unlock_group(sb, group);
@@ -5861,7 +5861,7 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
 	ext4_free_group_clusters_set(
 		sb, gdp, ext4_free_group_clusters(sb, gdp) +
 		count - already_freed);
-	ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
 	ext4_group_desc_csum_set(sb, group, gdp);
 	ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
 	sync_dirty_buffer(bitmap_bh);
@@ -6023,7 +6023,7 @@ do_more:
 
 	ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
 	ext4_free_group_clusters_set(sb, gdp, ret);
-	ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
+	ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh);
 	ext4_group_desc_csum_set(sb, block_group, gdp);
 	ext4_unlock_group(sb, block_group);
 
@@ -6280,7 +6280,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
 	free_clusters_count = clusters_freed +
 		ext4_free_group_clusters(sb, desc);
 	ext4_free_group_clusters_set(sb, desc, free_clusters_count);
-	ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
+	ext4_block_bitmap_csum_set(sb, desc, bitmap_bh);
 	ext4_group_desc_csum_set(sb, block_group, desc);
 	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeclusters_counter,
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 607a1af00665..0361c20910de 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1306,7 +1306,6 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
 }
 
 static int ext4_set_bitmap_checksums(struct super_block *sb,
-				     ext4_group_t group,
 				     struct ext4_group_desc *gdp,
 				     struct ext4_new_group_data *group_data)
 {
@@ -1325,7 +1324,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb,
 	bh = ext4_get_bitmap(sb, group_data->block_bitmap);
 	if (!bh)
 		return -EIO;
-	ext4_block_bitmap_csum_set(sb, group, gdp, bh);
+	ext4_block_bitmap_csum_set(sb, gdp, bh);
 	brelse(bh);
 
 	return 0;
@@ -1363,7 +1362,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
 		memset(gdp, 0, EXT4_DESC_SIZE(sb));
 		ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
 		ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
-		err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+		err = ext4_set_bitmap_checksums(sb, gdp, group_data);
 		if (err) {
 			ext4_std_error(sb, err);
 			break;
--
cgit v1.2.3

From b07ffe6927c75d99af534d685282ea188d9f71a6 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:01 +0800
Subject: ext4: set goal start correctly in ext4_mb_normalize_request

The goal start used by ext4_mb_find_by_goal is communicated through
ac_g_ex, so set ac_g_ex instead of ac_f_ex in ext4_mb_normalize_request.
Besides, we should ensure the goal start is in the range
[first_data_block, blocks_count), as ext4_mb_initialize_context does.

[ Added a check to make sure size is less than ar->pright; otherwise we
  could end up passing an underflowed value of ar->pright - size to
  ext4_get_group_no_and_offset(), which will trigger a BUG_ON later on.
  - TYT ]

Signed-off-by: Kemeng Shi
Reviewed-by: Ritesh Harjani (IBM)
Link: https://lore.kernel.org/r/20230303172120.3800725-2-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0d69cd8c5fc3..5a8fb0256752 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3993,6 +3993,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 				struct ext4_allocation_request *ar)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_super_block *es = sbi->s_es;
 	int bsbits, max;
 	ext4_lblk_t end;
 	loff_t size, start_off;
@@ -4188,18 +4189,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
 
 	/* define goal start in order to merge */
-	if (ar->pright && (ar->lright == (start + size))) {
+	if (ar->pright && (ar->lright == (start + size)) &&
+	    ar->pright >= size &&
+	    ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
 		/* merge to the right */
 		ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
-						&ac->ac_f_ex.fe_group,
-						&ac->ac_f_ex.fe_start);
+						&ac->ac_g_ex.fe_group,
+						&ac->ac_g_ex.fe_start);
 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 	}
-	if (ar->pleft && (ar->lleft + 1 == start)) {
+	if (ar->pleft && (ar->lleft + 1 == start) &&
+	    ar->pleft + 1 < ext4_blocks_count(es)) {
 		/* merge to the left */
 		ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
-						&ac->ac_f_ex.fe_group,
-						&ac->ac_f_ex.fe_start);
+						&ac->ac_g_ex.fe_group,
+						&ac->ac_g_ex.fe_start);
 		ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
 	}
--
cgit v1.2.3

From 01e4ca29451760b9ac10b4cdc231c52150842643 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:02 +0800
Subject: ext4: allow to find by goal if EXT4_MB_HINT_GOAL_ONLY is set

If EXT4_MB_HINT_GOAL_ONLY is set, ext4_mb_regular_allocator will only
allocate blocks from ext4_mb_find_by_goal. Allow ext4_mb_find_by_goal
to search when EXT4_MB_HINT_GOAL_ONLY is set; otherwise an allocation
with EXT4_MB_HINT_GOAL_ONLY set will always fail.

EXT4_MB_HINT_GOAL_ONLY is not used at all yet, which is why the problem
has gone unnoticed so far.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-3-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 5a8fb0256752..8e706ae74588 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2162,7 +2162,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 	struct ext4_free_extent ex;
 
-	if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
+	if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY)))
 		return 0;
 	if (grp->bb_free == 0)
 		return 0;
--
cgit v1.2.3

From 22fab984025313198559278814147c57901106f6 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:03 +0800
Subject: ext4: get correct ext4_group_info in ext4_mb_prefetch_fini

We always get the ext4_group_desc for group + 1 but the ext4_group_info
for group when checking whether we need to initialize the
ext4_group_info for the group. Get the ext4_group_desc for group as
well for the ext4_group_info initialization check.
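[ Editor's note: a minimal sketch of the off-by-one fixed above, using a
  hypothetical first iteration with group = 8 and nr = 1. Before this
  patch the loop body effectively did:

	gdp = ext4_get_group_desc(sb, 8, NULL);	/* descriptor of group 8 */
	grp = ext4_get_group_info(sb, 8);
	group--;				/* group is now 7 */
	grp = ext4_get_group_info(sb, 7);	/* info of group 7 */
	/* gdp (group 8) was then paired with grp (group 7) */

  so the descriptor consulted never belonged to the group actually being
  initialized. ]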
Signed-off-by: Ritesh Harjani (IBM)
Signed-off-by: Ojaswin Mujoo
Signed-off-by: Kemeng Shi
Link: https://lore.kernel.org/r/20230303172120.3800725-4-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8e706ae74588..c3c4d9e26b39 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2569,14 +2569,14 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
 void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
 			   unsigned int nr)
 {
-	while (nr-- > 0) {
-		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group,
-								  NULL);
-		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+	struct ext4_group_desc *gdp;
+	struct ext4_group_info *grp;
 
+	while (nr-- > 0) {
 		if (!group)
 			group = ext4_get_groups_count(sb);
 		group--;
+		gdp = ext4_get_group_desc(sb, group, NULL);
 		grp = ext4_get_group_info(sb, group);
 
 		if (EXT4_MB_GRP_NEED_INIT(grp) &&
--
cgit v1.2.3

From abc075d4a5dc48b68bd44b5ecb4e44e936d01195 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:04 +0800
Subject: ext4: correct calculation of s_mb_preallocated

We add pa_free to s_mb_preallocated when a new ext4_prealloc_space is
created. In ext4_mb_new_inode_pa, we call ext4_mb_use_inode_pa before
adding pa_free to s_mb_preallocated. However, ext4_mb_use_inode_pa
consumes pa_free for the block allocation that triggered the creation
of the ext4_prealloc_space. Add pa_free to s_mb_preallocated before
calling ext4_mb_use_inode_pa to correct the calculation of
s_mb_preallocated.

There is no such problem in ext4_mb_new_group_pa, as the pa_free of a
group pa is consumed in ext4_mb_release_context instead of
ext4_mb_use_group_pa.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-5-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c3c4d9e26b39..e549e88c304b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4671,8 +4671,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 		 pa->pa_len, pa->pa_lstart);
 	trace_ext4_mb_new_inode_pa(ac, pa);
 
-	ext4_mb_use_inode_pa(ac, pa);
 	atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
+	ext4_mb_use_inode_pa(ac, pa);
 
 	ei = EXT4_I(ac->ac_inode);
 	grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
--
cgit v1.2.3

From 1afdc5889427110c8466da2b23360a27be711ff0 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:05 +0800
Subject: ext4: correct start of used group pa for debug in ext4_mb_use_group_pa

As we don't correct pa_lstart here, there is no need to subtract the
consumed len from pa_lstart.
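[ Editor's note: a worked example with hypothetical numbers for the
  mb_debug fix above. Suppose a group pa starts at pa_lstart == 1000 and
  len == 8 blocks of it are consumed:

	/* old message: start printed as pa->pa_lstart - len == 992  */
	/* new message: start printed as pa->pa_lstart       == 1000 */

  Since ext4_mb_use_group_pa() never advances pa_lstart, 1000 is the
  value that actually describes the pa. ]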
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-6-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e549e88c304b..0af2ac678170 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4323,7 +4323,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 	 * Other CPUs are prevented from allocating from this pa by lg_mutex
 	 */
 	mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n",
-		 pa->pa_lstart-len, len, pa);
+		 pa->pa_lstart, len, pa);
 }
--
cgit v1.2.3

From 36cb0f52aeb929b141f3f580481eb04c65b6e2b9 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:06 +0800
Subject: ext4: protect pa->pa_free in ext4_discard_allocated_blocks

If ext4_mb_mark_diskspace_used fails in ext4_mb_new_blocks, we may
discard a pa that is already on a list. Protect the pa with pa_lock to
avoid races.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-7-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0af2ac678170..9d4091fad78d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4267,8 +4267,11 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 			ext4_mb_unload_buddy(&e4b);
 			return;
 		}
-		if (pa->pa_type == MB_INODE_PA)
+		if (pa->pa_type == MB_INODE_PA) {
+			spin_lock(&pa->pa_lock);
 			pa->pa_free += ac->ac_b_ex.fe_len;
+			spin_unlock(&pa->pa_lock);
+		}
 }
--
cgit v1.2.3

From 1b5c9d349455c84e5211354357870f64913b3020 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:07 +0800
Subject: ext4: add missed brelse in ext4_free_blocks_simple

Release the bitmap buffer_head we got if an error occurs. Besides, this
patch removes an unused assignment to err.
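[ Editor's note: a sketch of the race the pa_lock patch above closes
  (assumed interleaving, editor's illustration):

	CPU0 (ext4_discard_allocated_blocks)	CPU1 (discarding the same pa)
	pa->pa_free += ac->ac_b_ex.fe_len;	spin_lock(&pa->pa_lock);
						/* reads/updates pa_free */
						spin_unlock(&pa->pa_lock);

  Without taking pa_lock on CPU0, the update is unordered with respect
  to the concurrent user, so pa_free can be observed stale or updated
  twice. ]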
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-8-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9d4091fad78d..87ff3bfbafd8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5849,13 +5849,12 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
 	ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
 	bitmap_bh = ext4_read_block_bitmap(sb, group);
 	if (IS_ERR(bitmap_bh)) {
-		err = PTR_ERR(bitmap_bh);
 		pr_warn("Failed to read block bitmap\n");
 		return;
 	}
 	gdp = ext4_get_group_desc(sb, group, &gdp_bh);
 	if (!gdp)
-		return;
+		goto err_out;
 
 	for (i = 0; i < count; i++) {
 		if (!mb_test_bit(blkoff + i, bitmap_bh->b_data))
@@ -5864,7 +5863,7 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
 	mb_clear_bits(bitmap_bh->b_data, blkoff, count);
 	err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
 	if (err)
-		return;
+		goto err_out;
 	ext4_free_group_clusters_set(
 		sb, gdp, ext4_free_group_clusters(sb, gdp) +
 		count - already_freed);
@@ -5873,6 +5872,8 @@ static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
 	ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
 	sync_dirty_buffer(bitmap_bh);
 	sync_dirty_buffer(gdp_bh);
+
+err_out:
 	brelse(bitmap_bh);
 }
--
cgit v1.2.3

From 85b67ffb7d2bab27cc6c479e949e6fab7fb41aee Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:08 +0800
Subject: ext4: remove unused return value of ext4_mb_try_best_found and ext4_mb_free_metadata

The return values of the static functions ext4_mb_try_best_found and
ext4_mb_free_metadata are not used. Just remove the unused return
values.
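[ Editor's note: for illustration, the call-site pattern that motivates
  the change above; in ext4_mb_regular_allocator() the result was simply
  ignored:

	ext4_mb_try_best_found(ac, &e4b);	/* return value unused */

  so a void return type documents the actual contract. ]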
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-9-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 87ff3bfbafd8..d141e5c2e44c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2124,7 +2124,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
 }
 
 static noinline_for_stack
-int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
+void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 					struct ext4_buddy *e4b)
 {
 	struct ext4_free_extent ex = ac->ac_b_ex;
@@ -2135,7 +2135,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	BUG_ON(ex.fe_len <= 0);
 	err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
 	if (err)
-		return err;
+		return;
 
 	ext4_lock_group(ac->ac_sb, group);
 	max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
@@ -2147,8 +2147,6 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 
 	ext4_unlock_group(ac->ac_sb, group);
 	ext4_mb_unload_buddy(e4b);
-
-	return 0;
 }
 
 static noinline_for_stack
@@ -5700,7 +5698,7 @@ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
 	kmem_cache_free(ext4_free_data_cachep, entry);
 }
 
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 		      struct ext4_free_data *new_entry)
 {
@@ -5743,7 +5741,7 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 				EXT4_C2B(sbi, cluster),
 				"Block already on to-be-freed list");
 			kmem_cache_free(ext4_free_data_cachep, new_entry);
-			return 0;
+			return;
 		}
 	}
 
@@ -5769,7 +5767,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
 	list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
 	sbi->s_mb_free_pending += clusters;
 	spin_unlock(&sbi->s_md_lock);
-	return 0;
 }
--
cgit v1.2.3

From 139f46d3b5e69937025faa38cb0a3711b9808158 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:09 +0800
Subject: ext4: Remove unnecessary release when memory allocation failed in ext4_mb_init_cache

If allocating the array of buffer_heads fails, there is no resource
that needs to be freed and we can simply return an error.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-10-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d141e5c2e44c..17cfe6c2b582 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1168,10 +1168,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 	if (groups_per_page > 1) {
 		i = sizeof(struct buffer_head *) * groups_per_page;
 		bh = kzalloc(i, gfp);
-		if (bh == NULL) {
-			err = -ENOMEM;
-			goto out;
-		}
+		if (bh == NULL)
+			return -ENOMEM;
 	} else
 		bh = &bhs;
--
cgit v1.2.3

From 285164b80175157c18a06425cf25591c9f942b1a Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:10 +0800
Subject: ext4: remove unnecessary e4b->bd_buddy_page check in ext4_mb_load_buddy_gfp

e4b->bd_buddy_page is only set if we initialize ext4_buddy
successfully. So e4b->bd_buddy_page is always NULL in the error
handling branch. Just remove the dead check.
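[ Editor's note: a condensed sketch of why the check above is dead in
  ext4_mb_load_buddy_gfp() (editor's illustration, not the full code):

	e4b->bd_buddy_page = NULL;	/* cleared during setup */
	...
	if (ret)
		goto err;		/* bd_buddy_page still NULL here */
	...
	e4b->bd_buddy_page = page;	/* assigned only on success */

  Every jump to the err label happens before the assignment, so the NULL
  test could never find a page to put. ]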
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-11-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 17cfe6c2b582..8d3d59876a4a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1555,8 +1555,7 @@ err:
 		put_page(page);
 	if (e4b->bd_bitmap_page)
 		put_page(e4b->bd_bitmap_page);
-	if (e4b->bd_buddy_page)
-		put_page(e4b->bd_buddy_page);
+
 	e4b->bd_buddy = NULL;
 	e4b->bd_bitmap = NULL;
 	return ret;
--
cgit v1.2.3

From aaae558dae6f804407a50e0eaf66c93b46b9a493 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:11 +0800
Subject: ext4: remove unnecessary check in ext4_mb_new_blocks

1. Remove the unnecessary ac check: we always go to the out tag before
   ac is successfully allocated, so we can move the out tag after the
   freeing of ac and remove the NULL check of ac.

2. Remove the unnecessary *errp check: we only go to the errout tag if
   *errp is non-zero, so we can move the errout tag into the branch
   taken when *errp is non-zero.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-12-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 8d3d59876a4a..65dac09c4cea 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5642,16 +5642,15 @@ repeat:
 		*errp = -ENOSPC;
 	}
 
-errout:
 	if (*errp) {
+errout:
 		ac->ac_b_ex.fe_len = 0;
 		ar->len = 0;
 		ext4_mb_show_ac(ac);
 	}
 	ext4_mb_release_context(ac);
+	kmem_cache_free(ext4_ac_cachep, ac);
 out:
-	if (ac)
-		kmem_cache_free(ext4_ac_cachep, ac);
 	if (inquota && ar->len < inquota)
 		dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
 	if (!ar->len) {
--
cgit v1.2.3

From 976620bd261025c86bafe79c15170dc3da7a66f4 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:12 +0800
Subject: ext4: remove dead check in mb_buddy_mark_free

We always adjust first to an even number and last to an odd number, so
first == last can never happen. Remove this dead check.
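[ Editor's note: a worked example for the dead check above, with
  hypothetical inputs first = 5, last = 8. mb_buddy_mark_free() first
  handles the odd/even edges:

	if (first & 1)		/* 5 is odd  -> handled, first = 6 */
	if (!(last & 1))	/* 8 is even -> handled, last  = 7 */

  After the adjustment, first is always even and last always odd, and an
  even number can never equal an odd one, hence first == last is
  unreachable. ]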
Signed-off-by: Kemeng Shi
Link: https://lore.kernel.org/r/20230303172120.3800725-13-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 65dac09c4cea..591bd8e5e5c7 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1718,7 +1718,8 @@ static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
 			break;
 		order++;
 
-		if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
+		buddy2 = mb_find_buddy(e4b, order, &max);
+		if (!buddy2) {
 			mb_clear_bits(buddy, first, last - first + 1);
 			e4b->bd_info->bb_counters[order - 1] += last - first + 1;
 			break;
--
cgit v1.2.3

From 32c0869370194ae5ac9f9f501953ef693040f6a1 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:13 +0800
Subject: ext4: remove ac->ac_found > sbi->s_mb_min_to_scan dead check in ext4_mb_check_limits

The only call trace of ext4_mb_check_limits is the following:

  ext4_mb_complex_scan_group
    ext4_mb_measure_extent
      ext4_mb_check_limits(ac, e4b, 0);
    ext4_mb_check_limits(ac, e4b, 1);

If the first ac->ac_found > sbi->s_mb_max_to_scan check in
ext4_mb_check_limits is met, we set ac_status to AC_STATUS_BREAK and
call ext4_mb_try_best_found to try to use ac->ac_b_ex. If
ext4_mb_try_best_found succeeds, block allocation finishes and the
removed ac->ac_found > sbi->s_mb_min_to_scan check is not reachable. If
ext4_mb_try_best_found fails, we set EXT4_MB_HINT_FIRST and reset
ac->ac_b_ex to retry block allocation. We will then use any found free
extent in ext4_mb_measure_extent before reaching the removed
ac->ac_found > sbi->s_mb_min_to_scan check.

In summary, the removed ac->ac_found > sbi->s_mb_min_to_scan check is
not reachable, so we can remove that dead check.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-14-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 591bd8e5e5c7..eeafa1bba824 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2039,8 +2039,7 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
 	if (bex->fe_len < gex->fe_len)
 		return;
 
-	if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
-			&& bex->fe_group == e4b->bd_group) {
+	if (finish_group && bex->fe_group == e4b->bd_group) {
 		/* recheck chunk's availability - we don't know
 		 * when it was found (within this lock-unlock
 		 * period or not) */
--
cgit v1.2.3

From 78dc9f844f4ec999a30517313040948a4c4bbc00 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:14 +0800
Subject: ext4: use best found when complex scan of group finishs

If any bex that satisfies bex->fe_len >= gex->fe_len is found, it will
always be used when the complex scan of the group that bex belongs to
finishes. So there will not be any lock-unlock period.
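[ Editor's note: a condensed call sketch of why the availability recheck
  becomes removable (editor's illustration):

	ext4_mb_complex_scan_group(ac, e4b)
		ext4_lock_group(sb, group);
		/* scan; ext4_mb_measure_extent() records bex */
		ext4_mb_check_limits(ac, e4b, 1);	/* finish_group */
			ext4_mb_use_best_found(ac, e4b);
		ext4_unlock_group(sb, group);

  bex is found and consumed under one uninterrupted hold of the group
  lock, so there is no window in which the chunk could disappear. ]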
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-15-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eeafa1bba824..a12c33bc2756 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2019,8 +2019,6 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_free_extent *bex = &ac->ac_b_ex;
 	struct ext4_free_extent *gex = &ac->ac_g_ex;
-	struct ext4_free_extent ex;
-	int max;
 
 	if (ac->ac_status == AC_STATUS_FOUND)
 		return;
@@ -2039,16 +2037,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
 	if (bex->fe_len < gex->fe_len)
 		return;
 
-	if (finish_group && bex->fe_group == e4b->bd_group) {
-		/* recheck chunk's availability - we don't know
-		 * when it was found (within this lock-unlock
-		 * period or not) */
-		max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
-		if (max >= gex->fe_len) {
-			ext4_mb_use_best_found(ac, e4b);
-			return;
-		}
-	}
+	if (finish_group)
+		ext4_mb_use_best_found(ac, e4b);
 }
--
cgit v1.2.3

From df11909514c5243a62fcd5e122f40f19f3a7076c Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:15 +0800
Subject: ext4: remove unnecessary exit_meta_group_info tag

We goto exit_meta_group_info only to return -ENOMEM. Return -ENOMEM
directly instead of using goto, removing the unnecessary tag.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-16-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a12c33bc2756..4364343bbcef 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3069,7 +3069,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 		if (meta_group_info == NULL) {
 			ext4_msg(sb, KERN_ERR, "can't allocate mem "
 				 "for a buddy group");
-			goto exit_meta_group_info;
+			return -ENOMEM;
 		}
 		rcu_read_lock();
 		rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
@@ -3123,7 +3123,6 @@ exit_group_info:
 		group_info[idx] = NULL;
 		rcu_read_unlock();
 	}
-exit_meta_group_info:
 	return -ENOMEM;
 } /* ext4_mb_add_groupinfo */
--
cgit v1.2.3

From c7f2bafa3c242d4c69ba347d8983858a5137387b Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:16 +0800
Subject: ext4: remove unnecessary count2 in ext4_free_data_in_buddy

count2 is always 1 in the mb_debug call, so just remove the unnecessary
count2.
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-17-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4364343bbcef..f7cff3b5d740 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3590,7 +3590,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 {
 	struct ext4_buddy e4b;
 	struct ext4_group_info *db;
-	int err, count = 0, count2 = 0;
+	int err, count = 0;
 
 	mb_debug(sb, "gonna free %u blocks in group %u (0x%p):",
 		 entry->efd_count, entry->efd_group, entry);
@@ -3606,7 +3606,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	db = e4b.bd_info;
 	/* there are blocks to put in buddy to make them really free */
 	count += entry->efd_count;
-	count2++;
 	ext4_lock_group(sb, entry->efd_group);
 	/* Take it out of per group rb tree */
 	rb_erase(&entry->efd_node, &(db->bb_free_root));
@@ -3631,8 +3630,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb,
 	ext4_unlock_group(sb, entry->efd_group);
 	ext4_mb_unload_buddy(&e4b);
 
-	mb_debug(sb, "freed %d blocks in %d structures\n", count,
-		 count2);
+	mb_debug(sb, "freed %d blocks in 1 structures\n", count);
 }
--
cgit v1.2.3

From fb28f9ceec5622803e189a6ced7fdf26b7f17e7f Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:17 +0800
Subject: ext4: remove unnecessary goto in ext4_mb_mark_diskspace_used

When ext4_read_block_bitmap fails, we can return PTR_ERR(bitmap_bh)
directly, removing the unnecessary NULL assignment and check of
bitmap_bh.

Signed-off-by: Kemeng Shi
Reviewed-by: Ritesh Harjani (IBM)
Link: https://lore.kernel.org/r/20230303172120.3800725-18-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f7cff3b5d740..4136e53341c3 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3739,9 +3739,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 
 	bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
 	if (IS_ERR(bitmap_bh)) {
-		err = PTR_ERR(bitmap_bh);
-		bitmap_bh = NULL;
-		goto out_err;
+		return PTR_ERR(bitmap_bh);
 	}
 
 	BUFFER_TRACE(bitmap_bh, "getting write access");
--
cgit v1.2.3

From 3a037b1b889745b32c37d876223c7692721e8e95 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:18 +0800
Subject: ext4: remove repeat assignment to ac_f_ex

The call traces that assign ac_f_ex:

  ext4_mb_use_best_found
    ac->ac_f_ex = ac->ac_b_ex;
  ext4_mb_new_preallocation
    ext4_mb_new_group_pa
      ac->ac_f_ex = ac->ac_b_ex;
    ext4_mb_new_inode_pa
      ac->ac_f_ex = ac->ac_b_ex;

The actually allocated blocks are already stored in ac_f_ex in
ext4_mb_use_best_found, so there is no need to assign ac_f_ex in
ext4_mb_new_group_pa and ext4_mb_new_inode_pa. Just remove the repeated
assignments to ac_f_ex in ext4_mb_new_group_pa and
ext4_mb_new_inode_pa.
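[ Editor's note: for context, ac_f_ex records what was finally
  allocated; a sketch of the single remaining assignment and one
  consumer (editor's illustration):

	/* ext4_mb_use_best_found() */
	ac->ac_f_ex = ac->ac_b_ex;	/* keep the allocated extent */
	...
	/* later, on error paths */
	ext4_discard_allocated_blocks(ac);	/* reads ac->ac_f_ex */

  Because every path into ext4_mb_new_{inode,group}_pa() has already
  gone through ext4_mb_use_best_found(), the removed copies were pure
  duplicates. ]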
Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-19-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 8 --------
 1 file changed, 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4136e53341c3..59aa42457fca 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4636,10 +4636,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
 	}
 
-	/* preallocation can change ac_b_ex, thus we store actually
-	 * allocated blocks for history */
-	ac->ac_f_ex = ac->ac_b_ex;
-
 	pa->pa_lstart = ac->ac_b_ex.fe_logical;
 	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 	pa->pa_len = ac->ac_b_ex.fe_len;
@@ -4690,10 +4686,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 
 	pa = ac->ac_pa;
 
-	/* preallocation can change ac_b_ex, thus we store actually
-	 * allocated blocks for history */
-	ac->ac_f_ex = ac->ac_b_ex;
-
 	pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
 	pa->pa_lstart = pa->pa_pstart;
 	pa->pa_len = ac->ac_b_ex.fe_len;
--
cgit v1.2.3

From 46825e949057481ccb1f0e52c512fd52f1938cb3 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:19 +0800
Subject: ext4: remove comment code ext4_discard_preallocations

Just remove commented-out code in ext4_discard_preallocations.

Signed-off-by: Kemeng Shi
Reviewed-by: Ojaswin Mujoo
Link: https://lore.kernel.org/r/20230303172120.3800725-20-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 59aa42457fca..4c02a9263eb9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4926,7 +4926,6 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
 	int err;
 
 	if (!S_ISREG(inode->i_mode)) {
-		/*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
 		return;
 	}
--
cgit v1.2.3

From 253cacb0de89235673ad5889d61f275a73dbee79 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sat, 4 Mar 2023 01:21:20 +0800
Subject: ext4: simplify calculation of blkoff in ext4_mb_new_blocks_simple

We try to allocate a block starting from the goal in
ext4_mb_new_blocks_simple. We only need to compute blkoff from the goal
in the first group we search, and can set blkoff to 0 for all
subsequent groups.
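[ Editor's note: a worked example with hypothetical numbers for the
  simplification above. If the goal block lies 100 blocks into group 7:

	/* group 7:        blkoff = 100  (derived from the goal once) */
	/* group 8, 9, ...: blkoff = 0   (set at the end of the loop) */

  Only the first group searched needs the goal offset; every later group
  is scanned from its beginning. ]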
Signed-off-by: Kemeng Shi
Link: https://lore.kernel.org/r/20230303172120.3800725-21-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4c02a9263eb9..f09eb7f7fa5a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5773,9 +5773,6 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
 		return 0;
 	}
 
-	ext4_get_group_no_and_offset(sb,
-		max(ext4_group_first_block_no(sb, group), goal),
-		NULL, &blkoff);
 	while (1) {
 		i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
 					  blkoff);
@@ -5790,6 +5787,8 @@ static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
 		brelse(bitmap_bh);
 		if (i < max)
 			break;
+
+		blkoff = 0;
 	}
 
 	if (group >= ext4_get_groups_count(sb) || i >= max) {
--
cgit v1.2.3

From 1221b235019f9e3bbc38cfc53f7c03456bee53b4 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 12 Mar 2023 01:09:48 +0800
Subject: ext4: fix typos in mballoc

pa_plen -> pa_len
pa_start -> pa_pstart

Signed-off-by: Kemeng Shi
Reviewed-by: Ritesh Harjani (IBM)
Link: https://lore.kernel.org/r/20230311170949.1047958-2-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f09eb7f7fa5a..0b03a3b90207 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4146,7 +4146,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	 * provide gurantee on number of contiguous blocks allocation since that
 	 * depends upon free space left, etc).
 	 * In case of inode pa, later we use the allocated blocks
-	 * [pa_start + fe_logical - pa_lstart, fe_len/size] from the preallocated
+	 * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated
 	 * range of goal/best blocks [start, size] to put it at the
 	 * ac_o_ex.fe_logical extent of this inode.
 	 * (See ext4_mb_use_inode_pa() for more details)
@@ -4299,7 +4299,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
 	ac->ac_status = AC_STATUS_FOUND;
 	ac->ac_pa = pa;
 
-	/* we don't correct pa_pstart or pa_plen here to avoid
+	/* we don't correct pa_pstart or pa_len here to avoid
 	 * possible race when the group is being loaded concurrently
 	 * instead we correct pa later, after blocks are marked
 	 * in on-disk bitmap -- see ext4_mb_release_context()
--
cgit v1.2.3

From 91a48aaf59d0da6b93e49de87420c30aba0acaa8 Mon Sep 17 00:00:00 2001
From: Kemeng Shi
Date: Sun, 12 Mar 2023 01:09:49 +0800
Subject: ext4: avoid unnecessary pointer dereference in ext4_mb_normalize_request

The result of EXT4_SB(ac->ac_sb) is already stored in sbi at the
beginning of ext4_mb_normalize_request. Use sbi instead of
EXT4_SB(ac->ac_sb) to remove the unnecessary pointer dereferences.
Signed-off-by: Kemeng Shi
Reviewed-by: Ritesh Harjani (IBM)
Link: https://lore.kernel.org/r/20230311170949.1047958-3-shikemeng@huaweicloud.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0b03a3b90207..05a1f19c925b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4049,7 +4049,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		size = 8 * 1024 * 1024;
 	} else {
 		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
-		size	  = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
+		size	  = (loff_t) EXT4_C2B(sbi,
 					      ac->ac_o_ex.fe_len) << bsbits;
 	}
 	size = size >> bsbits;
@@ -4094,8 +4094,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 			continue;
 		}
 
-		pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
-						  pa->pa_len);
+		pa_end = pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
 
 		/* PA must not overlap original request */
 		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -4128,8 +4127,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 		spin_lock(&pa->pa_lock);
 		if (pa->pa_deleted == 0) {
-			pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
-							  pa->pa_len);
+			pa_end = pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
 			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
 		}
 		spin_unlock(&pa->pa_lock);
--
cgit v1.2.3

From 19b8b035a776939ceb3de0f45aded4751d7849ef Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Thu, 16 Mar 2023 17:07:32 -0400
Subject: ext4: convert some BUG_ON's in mballoc to use WARN_RATELIMITED instead

In cases where we have an obvious way of continuing, let's use
WARN_RATELIMITED() instead of BUG_ON().

Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 25 +++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 05a1f19c925b..90b061edb57d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1487,7 +1487,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 		put_page(page);
 	page = find_or_create_page(inode->i_mapping, pnum, gfp);
 	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
+		if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
+	"ext4: bitmap's paging->mapping != inode->i_mapping\n")) {
+			/* should never happen */
+			unlock_page(page);
+			ret = -EINVAL;
+			goto err;
+		}
 		if (!PageUptodate(page)) {
 			ret = ext4_mb_init_cache(page, NULL, gfp);
 			if (ret) {
@@ -1523,7 +1529,13 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
 		put_page(page);
 	page = find_or_create_page(inode->i_mapping, pnum, gfp);
 	if (page) {
-		BUG_ON(page->mapping != inode->i_mapping);
+		if (WARN_RATELIMIT(page->mapping != inode->i_mapping,
+	"ext4: buddy bitmap's page->mapping != inode->i_mapping\n")) {
+			/* should never happen */
+			unlock_page(page);
+			ret = -EINVAL;
+			goto err;
+		}
 		if (!PageUptodate(page)) {
 			ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
 						 gfp);
@@ -2221,7 +2233,9 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 			continue;
 
 		buddy = mb_find_buddy(e4b, i, &max);
-		BUG_ON(buddy == NULL);
+		if (WARN_RATELIMIT(buddy == NULL,
+			 "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i))
+			continue;
 
 		k = mb_find_next_zero_bit(buddy, max, 0);
 		if (k >= max) {
@@ -4229,15 +4243,14 @@ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 		if (ac->ac_f_ex.fe_len == 0)
 			return;
 		err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
-		if (err) {
+		if (WARN_RATELIMIT(err,
+				   "ext4: mb_load_buddy failed (%d)", err))
 			/*
 			 * This should never happen since we pin the
 			 * pages in the ext4_allocation_context so
 			 * ext4_mb_load_buddy() should never fail.
 			 */
-			WARN(1, "mb_load_buddy failed (%d)", err);
 			return;
-		}
 		ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
 		mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
 			       ac->ac_f_ex.fe_len);
--
cgit v1.2.3

From e86a718228b61eef747b8deb446f807b2be73148 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 25 Mar 2023 13:43:34 +0530
Subject: ext4: Stop searching if PA doesn't satisfy non-extent file

If we come across a PA that matches the logical offset but is unable to
satisfy a non-extent file due to its physical start being higher than
that supported by non-extent files, then simply stop searching for
another PA and break out of the loop. This is because, since PAs don't
overlap, we won't be able to find another inode PA that can satisfy the
original request.

Signed-off-by: Ojaswin Mujoo
Reviewed-by: Ritesh Harjani (IBM)
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/42404ca29bd304ae2c962184c3c32a02e8eefcd0.1679731817.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 90b061edb57d..2d4dc307941c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4380,8 +4380,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 		/* non-extent files can't have physical blocks past 2^32 */
 		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
-		     EXT4_MAX_BLOCK_FILE_PHYS))
-			continue;
+		     EXT4_MAX_BLOCK_FILE_PHYS)) {
+			/*
+			 * Since PAs don't overlap, we won't find any
+			 * other PA to satisfy this.
+			 */
+			break;
+		}
 
 		/* found preallocated blocks, use them */
 		spin_lock(&pa->pa_lock);
--
cgit v1.2.3

From 820897258ad342e78388ee9f5814fc485e79102a Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 25 Mar 2023 13:43:35 +0530
Subject: ext4: Refactor code related to freeing PAs

This patch makes the following changes:

* Rename ext4_mb_pa_free to ext4_mb_pa_put_free to better reflect
  its purpose
* Add new ext4_mb_pa_free() which only handles freeing
* Refactor ext4_mb_pa_callback() to use ext4_mb_pa_free()

There are no functional changes in this patch

Signed-off-by: Ojaswin Mujoo
Reviewed-by: Ritesh Harjani (IBM)
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/b273bc9cbf5bd278f641fa5bc6c0cc9e6cb3330c.1679731817.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 2d4dc307941c..58f16002cb2c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4528,16 +4528,22 @@ static void ext4_mb_mark_pa_deleted(struct super_block *sb,
 	}
 }
 
-static void ext4_mb_pa_callback(struct rcu_head *head)
+static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa)
 {
-	struct ext4_prealloc_space *pa;
-	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
-
+	BUG_ON(!pa);
 	BUG_ON(atomic_read(&pa->pa_count));
 	BUG_ON(pa->pa_deleted == 0);
 	kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
+static void ext4_mb_pa_callback(struct rcu_head *head)
+{
+	struct ext4_prealloc_space *pa;
+
+	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+	ext4_mb_pa_free(pa);
+}
+
 /*
  * drops a reference to preallocated space descriptor
  * if this was the last reference and the space is consumed
@@ -5055,14 +5061,20 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
 	return 0;
 }
 
-static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac)
 {
 	struct ext4_prealloc_space *pa = ac->ac_pa;
 
 	BUG_ON(!pa);
 	ac->ac_pa = NULL;
 	WARN_ON(!atomic_dec_and_test(&pa->pa_count));
-	kmem_cache_free(ext4_pspace_cachep, pa);
+	/*
+	 * current function is only called due to an error or due to
+	 * len of found blocks < len of requested blocks hence the PA has not
+	 * been added to grp->bb_prealloc_list. So we don't need to lock it
+	 */
+	pa->pa_deleted = 1;
+	ext4_mb_pa_free(pa);
 }
 
 #ifdef CONFIG_EXT4_DEBUG
@@ -5605,13 +5617,13 @@ repeat:
 		 * So we have to free this pa here itself.
 		 */
 		if (*errp) {
-			ext4_mb_pa_free(ac);
+			ext4_mb_pa_put_free(ac);
 			ext4_discard_allocated_blocks(ac);
 			goto errout;
 		}
 		if (ac->ac_status == AC_STATUS_FOUND &&
 		    ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
-			ext4_mb_pa_free(ac);
+			ext4_mb_pa_put_free(ac);
 	}
 	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
 		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -5630,7 +5642,7 @@ repeat:
 		 * If block allocation fails then the pa allocated above
 		 * needs to be freed here itself.
 		 */
-		ext4_mb_pa_free(ac);
+		ext4_mb_pa_put_free(ac);
 		*errp = -ENOSPC;
 	}
--
cgit v1.2.3

From bcf434992145dd08bb5c8d0bd4bf34811e0a5d78 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 25 Mar 2023 13:43:36 +0530
Subject: ext4: Refactor code in ext4_mb_normalize_request() and ext4_mb_use_preallocated()

Change some variable names to be more consistent and refactor some of
the code to make it easier to read.
There are no functional changes in this patch

Signed-off-by: Ojaswin Mujoo
Reviewed-by: Ritesh Harjani (IBM)
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/8edcab489c06cf861b19d87207d9b0ff7ac7f3c1.1679731817.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 96 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 49 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 58f16002cb2c..441b3b6d9387 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3994,7 +3994,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	loff_t orig_size __maybe_unused;
 	ext4_lblk_t start;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
-	struct ext4_prealloc_space *pa;
+	struct ext4_prealloc_space *tmp_pa;
+	ext4_lblk_t tmp_pa_start, tmp_pa_end;
 
 	/* do normalize only data requests, metadata requests
	   do not need preallocation */
@@ -4097,54 +4098,52 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 
 	/* check we don't cross already preallocated blocks */
 	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		ext4_lblk_t pa_end;
-
-		if (pa->pa_deleted)
+	list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) {
+		if (tmp_pa->pa_deleted)
 			continue;
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted) {
-			spin_unlock(&pa->pa_lock);
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted) {
+			spin_unlock(&tmp_pa->pa_lock);
 			continue;
 		}
 
-		pa_end = pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
+		tmp_pa_start = tmp_pa->pa_lstart;
+		tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
 
 		/* PA must not overlap original request */
-		BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
-			ac->ac_o_ex.fe_logical < pa->pa_lstart));
+		BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end ||
+			ac->ac_o_ex.fe_logical < tmp_pa_start));
 
 		/* skip PAs this normalized request doesn't overlap with */
-		if (pa->pa_lstart >= end || pa_end <= start) {
-			spin_unlock(&pa->pa_lock);
+		if (tmp_pa_start >= end || tmp_pa_end <= start) {
+			spin_unlock(&tmp_pa->pa_lock);
 			continue;
 		}
-		BUG_ON(pa->pa_lstart <= start && pa_end >= end);
+		BUG_ON(tmp_pa_start <= start && tmp_pa_end >= end);
 
 		/* adjust start or end to be adjacent to this pa */
-		if (pa_end <= ac->ac_o_ex.fe_logical) {
-			BUG_ON(pa_end < start);
-			start = pa_end;
-		} else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
-			BUG_ON(pa->pa_lstart > end);
-			end = pa->pa_lstart;
+		if (tmp_pa_end <= ac->ac_o_ex.fe_logical) {
+			BUG_ON(tmp_pa_end < start);
+			start = tmp_pa_end;
+		} else if (tmp_pa_start > ac->ac_o_ex.fe_logical) {
+			BUG_ON(tmp_pa_start > end);
+			end = tmp_pa_start;
 		}
-		spin_unlock(&pa->pa_lock);
+		spin_unlock(&tmp_pa->pa_lock);
 	}
 	rcu_read_unlock();
 	size = end - start;
 
 	/* XXX: extra loop to check we really don't overlap preallocations */
 	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
-		ext4_lblk_t pa_end;
-
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0) {
-			pa_end = pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
-			BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
+	list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted == 0) {
+			tmp_pa_start = tmp_pa->pa_lstart;
+			tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+			BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
 		}
-		spin_unlock(&pa->pa_lock);
+		spin_unlock(&tmp_pa->pa_lock);
 	}
 	rcu_read_unlock();
 
@@ -4359,7 +4358,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	int order, i;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_locality_group *lg;
-	struct ext4_prealloc_space *pa, *cpa = NULL;
+	struct ext4_prealloc_space *tmp_pa, *cpa = NULL;
+	ext4_lblk_t tmp_pa_start, tmp_pa_end;
 	ext4_fsblk_t goal_block;
 
 	/* only data can be preallocated */
@@ -4368,18 +4368,20 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 
 	/* first, try per-file preallocation */
 	rcu_read_lock();
-	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
+	list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) {
 
 		/* all fields in this condition don't change,
 		 * so we can skip locking for them */
-		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
-		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
-					       EXT4_C2B(sbi, pa->pa_len)))
+		tmp_pa_start = tmp_pa->pa_lstart;
+		tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+		if (ac->ac_o_ex.fe_logical < tmp_pa_start ||
+		    ac->ac_o_ex.fe_logical >= tmp_pa_end)
 			continue;
 
 		/* non-extent files can't have physical blocks past 2^32 */
 		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
-		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
+		    (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) >
 		     EXT4_MAX_BLOCK_FILE_PHYS)) {
 			/*
 			 * Since PAs don't overlap, we won't find any
@@ -4389,16 +4391,16 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 		}
 
 		/* found preallocated blocks, use them */
-		spin_lock(&pa->pa_lock);
-		if (pa->pa_deleted == 0 && pa->pa_free) {
-			atomic_inc(&pa->pa_count);
-			ext4_mb_use_inode_pa(ac, pa);
-			spin_unlock(&pa->pa_lock);
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) {
+			atomic_inc(&tmp_pa->pa_count);
+			ext4_mb_use_inode_pa(ac, tmp_pa);
+			spin_unlock(&tmp_pa->pa_lock);
 			ac->ac_criteria = 10;
 			rcu_read_unlock();
 			return true;
 		}
-		spin_unlock(&pa->pa_lock);
+		spin_unlock(&tmp_pa->pa_lock);
 	}
 	rcu_read_unlock();
 
@@ -4422,16 +4424,16 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 	 */
 	for (i = order; i < PREALLOC_TB_SIZE; i++) {
 		rcu_read_lock();
-		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
+		list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i],
 					pa_inode_list) {
-			spin_lock(&pa->pa_lock);
-			if (pa->pa_deleted == 0 &&
-					pa->pa_free >= ac->ac_o_ex.fe_len) {
+			spin_lock(&tmp_pa->pa_lock);
+			if (tmp_pa->pa_deleted == 0 &&
+					tmp_pa->pa_free >= ac->ac_o_ex.fe_len) {
 
 				cpa = ext4_mb_check_group_pa(goal_block,
-								pa, cpa);
+								tmp_pa, cpa);
 			}
-			spin_unlock(&pa->pa_lock);
+			spin_unlock(&tmp_pa->pa_lock);
 		}
 		rcu_read_unlock();
 	}
--
cgit v1.2.3

From 7692094ac513e48ee37d0f1fb057f3f7f8d53385 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 25 Mar 2023 13:43:37 +0530
Subject: ext4: Move overlap assert logic into a separate function

Abstract out the logic to double-check for overlaps in normalize_pa to
a separate function. Since there have been no reports in the past of
any overlaps hitting this bug_on(), in the future we can consider
calling this function under "#ifdef AGGRESSIVE_CHECK" only.
There are no functional changes in this patch

Signed-off-by: Ojaswin Mujoo
Reviewed-by: Ritesh Harjani (IBM)
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/35dd5d94fa0b2d1cd2d2947adf8967279c72967d.1679731817.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 35 ++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 441b3b6d9387..db46998fca0e 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3978,6 +3978,29 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
 	mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len);
 }
 
+static inline void
+ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
+			  ext4_lblk_t start, ext4_lblk_t end)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
+	struct ext4_prealloc_space *tmp_pa;
+	ext4_lblk_t tmp_pa_start, tmp_pa_end;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) {
+		spin_lock(&tmp_pa->pa_lock);
+		if (tmp_pa->pa_deleted == 0) {
+			tmp_pa_start = tmp_pa->pa_lstart;
+			tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
+
+			BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
+		}
+		spin_unlock(&tmp_pa->pa_lock);
+	}
+	rcu_read_unlock();
+}
+
 /*
  * Normalization means making request better in terms of
  * size and alignment
@@ -4135,17 +4158,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	size = end - start;
 
 	/* XXX: extra loop to check we really don't overlap preallocations */
-	rcu_read_lock();
-	list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) {
-		spin_lock(&tmp_pa->pa_lock);
-		if (tmp_pa->pa_deleted == 0) {
-			tmp_pa_start = tmp_pa->pa_lstart;
-			tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len);
-			BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start));
-		}
-		spin_unlock(&tmp_pa->pa_lock);
-	}
-	rcu_read_unlock();
+	ext4_mb_pa_assert_overlap(ac, start, end);
--
cgit v1.2.3

From 0830344c953aeabee91ac88281585756d047df39 Mon Sep 17 00:00:00 2001
From: Ojaswin Mujoo
Date: Sat, 25 Mar 2023 13:43:38 +0530
Subject: ext4: Abstract out overlap fix/check logic in ext4_mb_normalize_request()

Abstract out the logic of fixing PA overlaps in ext4_mb_normalize_request
to improve readability of code. This also makes it easier to make changes
to the overlap logic in future.

There are no functional changes in this patch

Signed-off-by: Ojaswin Mujoo
Reviewed-by: Ritesh Harjani (IBM)
Reviewed-by: Jan Kara
Link: https://lore.kernel.org/r/9b35f3955a1d7b66bbd713eca1e63026e01f78c1.1679731817.git.ojaswin@linux.ibm.com
Signed-off-by: Theodore Ts'o
---
 fs/ext4/mballoc.c | 109 ++++++++++++++++++++++++++++--------------------------
 1 file changed, 68 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index db46998fca0e..e8d660ff8c45 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4001,6 +4001,73 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac,
 	rcu_read_unlock();
 }
 
+/*
+ * Given an allocation context "ac" and a range "start", "end", check
+ * and adjust boundaries if the range overlaps with any of the existing
+ * preallocatoins stored in the corresponding inode of the allocation context.
+ * + *Parameters: + * ac allocation context + * start start of the new range + * end end of the new range + */ +static inline void +ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, + ext4_lblk_t *start, ext4_lblk_t *end) +{ + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_prealloc_space *tmp_pa; + ext4_lblk_t new_start, new_end; + ext4_lblk_t tmp_pa_start, tmp_pa_end; + + new_start = *start; + new_end = *end; + + /* check we don't cross already preallocated blocks */ + rcu_read_lock(); + list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) { + if (tmp_pa->pa_deleted) + continue; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + + /* PA must not overlap original request */ + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + + /* skip PAs this normalized request doesn't overlap with */ + if (tmp_pa_start >= new_end || tmp_pa_end <= new_start) { + spin_unlock(&tmp_pa->pa_lock); + continue; + } + BUG_ON(tmp_pa_start <= new_start && tmp_pa_end >= new_end); + + /* adjust start or end to be adjacent to this pa */ + if (tmp_pa_end <= ac->ac_o_ex.fe_logical) { + BUG_ON(tmp_pa_end < new_start); + new_start = tmp_pa_end; + } else if (tmp_pa_start > ac->ac_o_ex.fe_logical) { + BUG_ON(tmp_pa_start > new_end); + new_end = tmp_pa_start; + } + spin_unlock(&tmp_pa->pa_lock); + } + rcu_read_unlock(); + + /* XXX: extra loop to check we really don't overlap preallocations */ + ext4_mb_pa_assert_overlap(ac, new_start, new_end); + + *start = new_start; + *end = new_end; +} + /* * Normalization means making request better in terms of * size and alignment @@ -4016,9 +4083,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, loff_t size, start_off; loff_t orig_size __maybe_unused; ext4_lblk_t start; - struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); - struct ext4_prealloc_space *tmp_pa; - ext4_lblk_t tmp_pa_start, tmp_pa_end; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -4119,47 +4183,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, end = start + size; - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) { - if (tmp_pa->pa_deleted) - continue; - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); - continue; - } - - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); - - /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || - ac->ac_o_ex.fe_logical < tmp_pa_start)); - - /* skip PAs this normalized request doesn't overlap with */ - if (tmp_pa_start >= end || tmp_pa_end <= start) { - spin_unlock(&tmp_pa->pa_lock); - continue; - } - BUG_ON(tmp_pa_start <= start && tmp_pa_end >= end); + ext4_mb_pa_adjust_overlap(ac, &start, &end); - /* adjust start or end to be adjacent to this pa */ - if (tmp_pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_end < start); - start = tmp_pa_end; - } else if (tmp_pa_start > ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_start > end); - end = tmp_pa_start; - } - spin_unlock(&tmp_pa->pa_lock); - } - rcu_read_unlock(); size = end - start; - /* XXX: extra loop to check we really don't overlap 
preallocations */ - ext4_mb_pa_assert_overlap(ac, start, end); - /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. -- cgit v1.2.3 From 93cdf49f6eca5e23f6546b8f28457b2e6a6961d9 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Sat, 25 Mar 2023 13:43:39 +0530 Subject: ext4: Fix best extent lstart adjustment logic in ext4_mb_new_inode_pa() When the length of the best extent found is less than the length of the goal extent, we need to make sure that the best extent at least covers the start of the original request. This is done by adjusting the ac_b_ex.fe_logical (logical start) of the extent. While doing so, the current logic sometimes results in the best extent's logical range overflowing the goal extent. Since this best extent is later added to the inode preallocation list, we have a possibility of introducing overlapping preallocations. This is discussed in detail here [1]. As per Jan's suggestion, to fix this, replace the existing logic with the below logic for adjusting the best extent, as it keeps fragmentation in check while ensuring the logical range of the best extent doesn't overflow out of the goal extent: 1. Check if best extent can be kept at end of goal range and still cover original start. 2. Else, check if best extent can be kept at start of goal range and still cover original start. 3. Else, keep the best extent at start of original request. Also, add a few extra BUG_ONs that might help catch errors faster. [1] https://lore.kernel.org/r/Y+OGkVvzPN0RMv0O@li-bb2b2a4c-3307-11b2-a85c-8fa5c3a69313.ibm.com Suggested-by: Jan Kara Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/f96aca6d415b36d1f90db86c1a8cd7e2e9d7ab0e.1679731817.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e8d660ff8c45..7294251a7d96 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4329,6 +4329,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); + BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); @@ -4667,10 +4668,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { - int winl; - int wins; - int win; - int offs; + int new_bex_start; + int new_bex_end; /* we can't allocate as much as normalizer wants. * so, found space must get proper lstart */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); - /* we're limited by original request in that * logical block must be covered any way * winl is window we can move our chunk within */ - winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical; + /* + * Use the below logic for adjusting best extent as it keeps + * fragmentation in check while ensuring logical range of best + * extent doesn't overflow out of goal extent: + * + * 1. Check if best ex can be kept at end of goal and still + * cover original start + * 2. Else, check if best ex can be kept at start of goal and + * still cover original start + * 3.
Else, keep the best ex at start of original request. + */ + new_bex_end = ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len); + new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical >= new_bex_start) + goto adjust_bex; - /* also, we should cover whole original request */ - wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len); + new_bex_start = ac->ac_g_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); + if (ac->ac_o_ex.fe_logical < new_bex_end) + goto adjust_bex; - /* the smallest one defines real window */ - win = min(winl, wins); + new_bex_start = ac->ac_o_ex.fe_logical; + new_bex_end = + new_bex_start + EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - offs = ac->ac_o_ex.fe_logical % - EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (offs && offs < win) - win = offs; +adjust_bex: + ac->ac_b_ex.fe_logical = new_bex_start; - ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - - EXT4_NUM_B2C(sbi, win); BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len); + BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical + + EXT4_C2B(sbi, ac->ac_g_ex.fe_len))); } pa->pa_lstart = ac->ac_b_ex.fe_logical; -- cgit v1.2.3 From a8e38fd37cff911638ac288adb138265f71e50c0 Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Sat, 25 Mar 2023 13:43:40 +0530 Subject: ext4: Convert pa->pa_inode_list and pa->pa_obj_lock into a union ** Splitting pa->pa_inode_list ** Currently, we use the same pa->pa_inode_list to add a pa to either the inode preallocation list or the locality group preallocation list. For better clarity, split this list into a union of 2 list_heads and use either of them based on the type of pa. ** Splitting pa->pa_obj_lock ** Currently, pa->pa_obj_lock is either assigned &ei->i_prealloc_lock for inode PAs or lg_prealloc_lock for lg PAs, and is then used to lock the lists containing these PAs. Make the distinction between the 2 PA types clear by changing this lock to a union of 2 locks. Explicitly use the pa_node_lock.inode_lock for inode PAs and pa_node_lock.lg_lock for lg PAs. This patch is required so that the locality group preallocation code remains the same, as upcoming patches are going to change the inode preallocation code to move from a list to an rbtree based implementation. This patch also makes it easier to review the upcoming patches. There are no functional changes in this patch.
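Condensed from the ext4_mb_put_pa() hunk below (a sketch of the pattern this patch establishes, not new code), every site that links or unlinks a PA now picks the union members based on pa->pa_type:

	if (pa->pa_type == MB_INODE_PA) {
		/* inode PA: interpret the union as the inode list and its lock */
		spin_lock(pa->pa_node_lock.inode_lock);
		list_del_rcu(&pa->pa_node.inode_list);
		spin_unlock(pa->pa_node_lock.inode_lock);
	} else {
		/* lg PA: the same storage is the locality group list and lock */
		spin_lock(pa->pa_node_lock.lg_lock);
		list_del_rcu(&pa->pa_node.lg_list);
		spin_unlock(pa->pa_node_lock.lg_lock);
	}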
Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/1d7ac0557e998c3fc7eef422b52e4bc67bdef2b0.1679731817.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 76 ++++++++++++++++++++++++++++++++----------------------- fs/ext4/mballoc.h | 10 ++++++-- 2 files changed, 52 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 7294251a7d96..b50c2e5abe67 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3988,7 +3988,7 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t tmp_pa_start, tmp_pa_end; rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) { + list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { tmp_pa_start = tmp_pa->pa_lstart; @@ -4026,7 +4026,7 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) { + list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { if (tmp_pa->pa_deleted) continue; spin_lock(&tmp_pa->pa_lock); @@ -4409,7 +4409,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* first, try per-file preallocation */ rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_inode_list) { + list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ @@ -4466,7 +4466,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], - pa_inode_list) { + pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { @@ -4640,9 +4640,15 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_INODE_PA) { + spin_lock(pa->pa_node_lock.inode_lock); + list_del_rcu(&pa->pa_node.inode_list); + spin_unlock(pa->pa_node_lock.inode_lock); + } else { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + } call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } @@ -4718,7 +4724,7 @@ adjust_bex: pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_node.inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; @@ -4733,14 +4739,14 @@ adjust_bex: ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); - pa->pa_obj_lock = &ei->i_prealloc_lock; + pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - spin_lock(pa->pa_obj_lock); - list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); + spin_lock(pa->pa_node_lock.inode_lock); + list_add_rcu(&pa->pa_node.inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } @@ -4768,7 +4774,7 @@ 
ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; @@ -4784,7 +4790,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) lg = ac->ac_lg; BUG_ON(lg == NULL); - pa->pa_obj_lock = &lg->lg_prealloc_lock; + pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); @@ -4960,9 +4966,15 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + if (pa->pa_type == MB_GROUP_PA) { + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); + } else { + spin_lock(pa->pa_node_lock.inode_lock); + list_del_rcu(&pa->pa_node.inode_list); + spin_unlock(pa->pa_node_lock.inode_lock); + } if (pa->pa_type == MB_GROUP_PA) ext4_mb_release_group_pa(&e4b, pa); @@ -5024,8 +5036,8 @@ repeat: spin_lock(&ei->i_prealloc_lock); while (!list_empty(&ei->i_prealloc_list) && needed) { pa = list_entry(ei->i_prealloc_list.prev, - struct ext4_prealloc_space, pa_inode_list); - BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock); + struct ext4_prealloc_space, pa_node.inode_list); + BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should @@ -5042,7 +5054,7 @@ repeat: if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + list_del_rcu(&pa->pa_node.inode_list); list_add(&pa->u.pa_tmp_list, &list); needed--; continue; @@ -5332,7 +5344,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { @@ -5355,7 +5367,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_inode_list); + list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; @@ -5416,7 +5428,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], - pa_inode_list, + pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { @@ -5425,8 +5437,8 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ - list_add_tail_rcu(&pa->pa_inode_list, - &tmp_pa->pa_inode_list); + list_add_tail_rcu(&pa->pa_node.lg_list, + &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total @@ -5437,7 +5449,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) lg_prealloc_count++; } if (!added) - list_add_tail_rcu(&pa->pa_inode_list, + list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); @@ 
-5493,9 +5505,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * doesn't grow big. */ if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); + spin_lock(pa->pa_node_lock.lg_lock); + list_del_rcu(&pa->pa_node.lg_list); + spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } @@ -5505,9 +5517,9 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) * treat per-inode prealloc list as a lru list, then try * to trim the least recently used PA. */ - spin_lock(pa->pa_obj_lock); - list_move(&pa->pa_inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_obj_lock); + spin_lock(pa->pa_node_lock.inode_lock); + list_move(&pa->pa_node.inode_list, &ei->i_prealloc_list); + spin_unlock(pa->pa_node_lock.inode_lock); } ext4_mb_put_pa(ac, ac->ac_sb, pa); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dcda2a943cee..398a6688c341 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -114,7 +114,10 @@ struct ext4_free_data { }; struct ext4_prealloc_space { - struct list_head pa_inode_list; + union { + struct list_head inode_list; /* for inode PAs */ + struct list_head lg_list; /* for lg PAs */ + } pa_node; struct list_head pa_group_list; union { struct list_head pa_tmp_list; @@ -128,7 +131,10 @@ struct ext4_prealloc_space { ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ - spinlock_t *pa_obj_lock; + union { + spinlock_t *inode_lock; /* locks the inode list holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ + } pa_node_lock; struct inode *pa_inode; /* hack, for history only */ }; -- cgit v1.2.3 From 3872778664e36528caf8b27f355e75482f6d562d Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Sat, 25 Mar 2023 13:43:41 +0530 Subject: ext4: Use rbtrees to manage PAs instead of inode i_prealloc_list Currently, the kernel uses i_prealloc_list to hold all the inode preallocations. This is known to cause degradation in performance in workloads which perform a large number of sparse writes on a single file. This is mainly because functions like ext4_mb_normalize_request() and ext4_mb_use_preallocated() iterate over this complete list, resulting in slowdowns when a large number of PAs are present. Patch 27bc446e2 partially fixed this by enforcing a limit of 512 for the inode preallocation list and adding logic to continually trim the list if it grows above the threshold; however, our testing revealed that a hardcoded value is not suitable for all kinds of workloads. To optimize this, add an rbtree to the inode and hold the inode preallocations in this rbtree. This will make iterating over inode PAs faster and scale much better than a linked list. Additionally, we had to remove the LRU logic that was added during trimming of the list (in ext4_mb_release_context()) as it would add extra overhead in the rbtree. The discards now happen in the lowest-logical-offset-first order. ** Locking notes ** With the introduction of rbtree to maintain inode PAs, we can't use RCU to walk the tree for searching since it can result in partial traversals which might miss some nodes (or entire subtrees) while discards happen in parallel (which happens under a lock). Hence this patch converts the ei->i_prealloc_lock spinlock to an rwlock.
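In code, the conversion amounts to the following pattern, condensed from the hunks that follow (a simplified sketch, not a complete function): lookups take the lock shared, while the discard path takes it exclusive to erase nodes:

	/* fast path: search the inode PA rbtree for a usable PA */
	read_lock(&ei->i_prealloc_lock);
	/* ... walk ei->i_prealloc_node, comparing pa_lstart at each node ... */
	read_unlock(&ei->i_prealloc_lock);

	/* slow path: discard, which unlinks PAs from the tree */
	write_lock(&ei->i_prealloc_lock);
	rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
	write_unlock(&ei->i_prealloc_lock);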
Almost all the codepaths that read/modify the PA rbtrees are protected by the higher level inode->i_data_sem (except ext4_mb_discard_group_preallocations() and ext4_clear_inode()). IIUC, the only place we need lock protection is when one thread is reading "searching" the PA rbtree (earlier protected under rcu_read_lock()) and another is "deleting" the PAs in the ext4_mb_discard_group_preallocations() function (which iterates all the PAs using the grp->bb_prealloc_list and deletes PAs from the tree without taking any inode lock (i_data_sem)). So, this patch converts all rcu_read_lock/unlock() paths for inode list PA to use read_lock() and all places where we were using the ei->i_prealloc_lock spinlock will now be using write_lock(). Note that this makes the fast path (searching for the right PA, e.g. in ext4_mb_use_preallocated() or ext4_mb_normalize_request()) now use read_lock() instead of rcu_read_lock/unlock(). This also means the fast path can now block on the slow discard path (ext4_mb_discard_group_preallocations()), which uses write_lock(). But this is not as bad as it looks. This is because - 1. The slow path only occurs when the normal allocation failed and we can say that we are low on disk space. One can argue this scenario won't be very frequent. 2. ext4_mb_discard_group_preallocations() locks and unlocks the rwlock for deleting every individual PA. This gives enough opportunity for the fast path to acquire the read_lock for searching the PA inode list. Suggested-by: Ritesh Harjani (IBM) Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/4137bce8f6948fedd8bae134dabae24acfe699c6.1679731817.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +- fs/ext4/mballoc.c | 286 +++++++++++++++++++++++++++++++++++++++--------------- fs/ext4/mballoc.h | 6 +- fs/ext4/super.c | 4 +- 4 files changed, 214 insertions(+), 86 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9b2cfc32cf78..8cf955b1dca5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1120,8 +1120,8 @@ struct ext4_inode_info { /* mballoc */ atomic_t i_prealloc_active; - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; + struct rb_root i_prealloc_node; + rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b50c2e5abe67..9b4c6033bd69 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3978,6 +3978,24 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } +/* + * This function returns the next element to look at during inode + * PA rbtree walk.
We assume that we have held the inode PA rbtree lock + * (ei->i_prealloc_lock) + * + * new_start The start of the range we want to compare + * cur_start The existing start that we are comparing against + * node The node of the rb_tree + */ +static inline struct rb_node* +ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) +{ + if (new_start < cur_start) + return node->rb_left; + else + return node->rb_right; +} + static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, ext4_lblk_t end) @@ -3986,19 +4004,22 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted == 0) { - tmp_pa_start = tmp_pa->pa_lstart; - tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + tmp_pa_start = tmp_pa->pa_lstart; + tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); - } spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); } /* @@ -4006,60 +4027,140 @@ ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * - *Parameters: + * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, - ext4_lblk_t *start, ext4_lblk_t *end) + ext4_lblk_t *start, ext4_lblk_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_prealloc_space *tmp_pa; + struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; + struct rb_node *iter; ext4_lblk_t new_start, new_end; - ext4_lblk_t tmp_pa_start, tmp_pa_end; + ext4_lblk_t tmp_pa_start, tmp_pa_end, left_pa_end = -1, right_pa_start = -1; new_start = *start; new_end = *end; - /* check we don't cross already preallocated blocks */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { - if (tmp_pa->pa_deleted) - continue; - spin_lock(&tmp_pa->pa_lock); - if (tmp_pa->pa_deleted) { - spin_unlock(&tmp_pa->pa_lock); - continue; - } - + /* + * Adjust the normalized range so that it doesn't overlap with any + * existing preallocated blocks(PAs). Make sure to hold the rbtree lock + * so it doesn't change underneath us. 
+ */ + read_lock(&ei->i_prealloc_lock); + + /* Step 1: find any one immediate neighboring PA of the normalized range */ + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); /* PA must not overlap original request */ - BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || - ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) + BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || + ac->ac_o_ex.fe_logical < tmp_pa_start)); + spin_unlock(&tmp_pa->pa_lock); + } - /* skip PAs this normalized request doesn't overlap with */ - if (tmp_pa_start >= new_end || tmp_pa_end <= new_start) { + /* + * Step 2: check if the found PA is left or right neighbor and + * get the other neighbor + */ + if (tmp_pa) { + if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { + struct rb_node *tmp; + + left_pa = tmp_pa; + tmp = rb_next(&left_pa->pa_node.inode_node); + if (tmp) { + right_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } else { + struct rb_node *tmp; + + right_pa = tmp_pa; + tmp = rb_prev(&right_pa->pa_node.inode_node); + if (tmp) { + left_pa = rb_entry(tmp, + struct ext4_prealloc_space, + pa_node.inode_node); + } + } + } + + /* Step 3: get the non deleted neighbors */ + if (left_pa) { + for (iter = &left_pa->pa_node.inode_node;; + iter = rb_prev(iter)) { + if (!iter) { + left_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + left_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } spin_unlock(&tmp_pa->pa_lock); - continue; } - BUG_ON(tmp_pa_start <= new_start && tmp_pa_end >= new_end); - - /* adjust start or end to be adjacent to this pa */ - if (tmp_pa_end <= ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_end < new_start); - new_start = tmp_pa_end; - } else if (tmp_pa_start > ac->ac_o_ex.fe_logical) { - BUG_ON(tmp_pa_start > new_end); - new_end = tmp_pa_start; + } + + if (right_pa) { + for (iter = &right_pa->pa_node.inode_node;; + iter = rb_next(iter)) { + if (!iter) { + right_pa = NULL; + break; + } + + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); + right_pa = tmp_pa; + spin_lock(&tmp_pa->pa_lock); + if (tmp_pa->pa_deleted == 0) { + spin_unlock(&tmp_pa->pa_lock); + break; + } + spin_unlock(&tmp_pa->pa_lock); } - spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + + if (left_pa) { + left_pa_end = + left_pa->pa_lstart + EXT4_C2B(sbi, left_pa->pa_len); + BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); + } + + if (right_pa) { + right_pa_start = right_pa->pa_lstart; + BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); + } + + /* Step 4: trim our normalized range to not overlap with the neighbors */ + if (left_pa) { + if (left_pa_end > new_start) + new_start = left_pa_end; + } + + if (right_pa) { + if (right_pa_start < new_end) + new_end = right_pa_start; + } + read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); @@ -4401,6 +4502,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa, *cpa = NULL; ext4_lblk_t tmp_pa_start, tmp_pa_end; + struct rb_node *iter; 
ext4_fsblk_t goal_block; /* only data can be preallocated */ @@ -4408,14 +4510,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return false; /* first, try per-file preallocation */ - rcu_read_lock(); - list_for_each_entry_rcu(tmp_pa, &ei->i_prealloc_list, pa_node.inode_list) { + read_lock(&ei->i_prealloc_lock); + for (iter = ei->i_prealloc_node.rb_node; iter; + iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, + tmp_pa_start, iter)) { + tmp_pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); /* all fields in this condition don't change, * so we can skip locking for them */ tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = tmp_pa->pa_lstart + EXT4_C2B(sbi, tmp_pa->pa_len); + /* original request start doesn't lie in this PA */ if (ac->ac_o_ex.fe_logical < tmp_pa_start || ac->ac_o_ex.fe_logical >= tmp_pa_end) continue; @@ -4438,12 +4545,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); ac->ac_criteria = 10; - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); return true; } spin_unlock(&tmp_pa->pa_lock); } - rcu_read_unlock(); + read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) @@ -4596,6 +4703,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, { ext4_group_t grp; ext4_fsblk_t grp_blk; + struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); @@ -4641,16 +4749,41 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { - spin_lock(pa->pa_node_lock.inode_lock); - list_del_rcu(&pa->pa_node.inode_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } +} - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); +static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + struct ext4_prealloc_space *iter_pa, *new_pa; + ext4_lblk_t iter_start, new_start; + + while (*iter) { + iter_pa = rb_entry(*iter, struct ext4_prealloc_space, + pa_node.inode_node); + new_pa = rb_entry(new, struct ext4_prealloc_space, + pa_node.inode_node); + iter_start = iter_pa->pa_lstart; + new_start = new_pa->pa_lstart; + + parent = *iter; + if (new_start < iter_start) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); } /* @@ -4724,7 +4857,6 @@ adjust_bex: pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); - INIT_LIST_HEAD(&pa->pa_node.inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; @@ -4744,9 +4876,9 @@ adjust_bex: list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - spin_lock(pa->pa_node_lock.inode_lock); - list_add_rcu(&pa->pa_node.inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); + write_unlock(pa->pa_node_lock.inode_lock); 
atomic_inc(&ei->i_prealloc_active); } @@ -4908,6 +5040,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_prealloc_space *pa, *tmp; struct list_head list; struct ext4_buddy e4b; + struct ext4_inode_info *ei; int err; int free = 0; @@ -4971,18 +5104,21 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { - spin_lock(pa->pa_node_lock.inode_lock); - list_del_rcu(&pa->pa_node.inode_list); - spin_unlock(pa->pa_node_lock.inode_lock); + write_lock(pa->pa_node_lock.inode_lock); + ei = EXT4_I(pa->pa_inode); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); + write_unlock(pa->pa_node_lock.inode_lock); } - if (pa->pa_type == MB_GROUP_PA) + list_del(&pa->u.pa_tmp_list); + + if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); - else + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); - - list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); + } } ext4_unlock_group(sb, group); @@ -5012,6 +5148,7 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; + struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) { @@ -5033,17 +5170,19 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed) repeat: /* first, collect all pa's in the inode */ - spin_lock(&ei->i_prealloc_lock); - while (!list_empty(&ei->i_prealloc_list) && needed) { - pa = list_entry(ei->i_prealloc_list.prev, - struct ext4_prealloc_space, pa_node.inode_list); + write_lock(&ei->i_prealloc_lock); + for (iter = rb_first(&ei->i_prealloc_node); iter && needed; + iter = rb_next(iter)) { + pa = rb_entry(iter, struct ext4_prealloc_space, + pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); + spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! 
used pa while discarding"); WARN_ON(1); @@ -5054,7 +5193,7 @@ repeat: if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); - list_del_rcu(&pa->pa_node.inode_list); + rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); needed--; continue; @@ -5062,7 +5201,7 @@ repeat: /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from @@ -5079,7 +5218,7 @@ repeat: schedule_timeout_uninterruptible(HZ); goto repeat; } - spin_unlock(&ei->i_prealloc_lock); + write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); @@ -5111,7 +5250,7 @@ repeat: put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); - call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); + ext4_mb_pa_free(pa); } } @@ -5485,7 +5624,6 @@ static void ext4_mb_trim_inode_pa(struct inode *inode) static int ext4_mb_release_context(struct ext4_allocation_context *ac) { struct inode *inode = ac->ac_inode; - struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -5512,16 +5650,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) } } - if (pa->pa_type == MB_INODE_PA) { - /* - * treat per-inode prealloc list as a lru list, then try - * to trim the least recently used PA. - */ - spin_lock(pa->pa_node_lock.inode_lock); - list_move(&pa->pa_node.inode_list, &ei->i_prealloc_list); - spin_unlock(pa->pa_node_lock.inode_lock); - } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_page) diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 398a6688c341..f8e8ee493867 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -115,7 +115,7 @@ struct ext4_free_data { struct ext4_prealloc_space { union { - struct list_head inode_list; /* for inode PAs */ + struct rb_node inode_node; /* for inode PA rbtree */ struct list_head lg_list; /* for lg PAs */ } pa_node; struct list_head pa_group_list; @@ -132,10 +132,10 @@ struct ext4_prealloc_space { ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ union { - spinlock_t *inode_lock; /* locks the inode list holding this PA */ + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ spinlock_t *lg_lock; /* locks the lg list holding this PA */ } pa_node_lock; - struct inode *pa_inode; /* hack, for history only */ + struct inode *pa_inode; /* used to get the inode during group discard */ }; enum { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f226f8ab469b..54fb600bf51f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1325,9 +1325,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) inode_set_iversion(&ei->vfs_inode, 1); ei->i_flags = 0; spin_lock_init(&ei->i_raw_lock); - INIT_LIST_HEAD(&ei->i_prealloc_list); + ei->i_prealloc_node = RB_ROOT; atomic_set(&ei->i_prealloc_active, 0); - spin_lock_init(&ei->i_prealloc_lock); + rwlock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_list); -- cgit v1.2.3 From 361eb69fc99f1a8f1d653d69ecd742f3cbb896be Mon Sep 17 00:00:00 2001 From: Ojaswin Mujoo Date: Sat, 25 Mar 2023 13:43:42 +0530 Subject: ext4: Remove the logic to trim inode PAs Earlier, inode PAs were stored in a linked list. 
This caused a need to periodically trim the list down in order to avoid growing it to a very large size, as this would severely affect performance during list iteration. Recent patches changed this list to an rbtree, and since the tree scales up much better, we no longer need to have the trim functionality, hence remove it. Signed-off-by: Ojaswin Mujoo Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/c409addceaa3ade4b40328e28e3b54b2f259689e.1679731817.git.ojaswin@linux.ibm.com Signed-off-by: Theodore Ts'o --- Documentation/admin-guide/ext4.rst | 3 --- fs/ext4/ext4.h | 1 - fs/ext4/mballoc.c | 20 -------------------- fs/ext4/mballoc.h | 5 ----- fs/ext4/sysfs.c | 2 -- 5 files changed, 31 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/ext4.rst b/Documentation/admin-guide/ext4.rst index 4c559e08d11e..5740d85439ff 100644 --- a/Documentation/admin-guide/ext4.rst +++ b/Documentation/admin-guide/ext4.rst @@ -489,9 +489,6 @@ Files in /sys/fs/ext4/: multiple of this tuning parameter if the stripe size is not set in the ext4 superblock - mb_max_inode_prealloc - The maximum length of per-inode ext4_prealloc_space list. - mb_max_to_scan The maximum number of extents the multiblock allocator will search to find the best extent. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8cf955b1dca5..993d3284b430 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1613,7 +1613,6 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; - unsigned int s_mb_max_inode_prealloc; unsigned int s_max_dir_size_kb; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 9b4c6033bd69..78259bddbc4d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3417,7 +3417,6 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_max_inode_prealloc = MB_DEFAULT_MAX_INODE_PREALLOC; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes.
However for bigalloc file @@ -5601,29 +5600,11 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) return ; } -/* - * if per-inode prealloc list is too long, trim some PA - */ -static void ext4_mb_trim_inode_pa(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int count, delta; - - count = atomic_read(&ei->i_prealloc_active); - delta = (sbi->s_mb_max_inode_prealloc >> 2) + 1; - if (count > sbi->s_mb_max_inode_prealloc + delta) { - count -= sbi->s_mb_max_inode_prealloc; - ext4_discard_preallocations(inode, count); - } -} - /* * release all resource we used in allocation */ static int ext4_mb_release_context(struct ext4_allocation_context *ac) { - struct inode *inode = ac->ac_inode; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { @@ -5659,7 +5640,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); - ext4_mb_trim_inode_pa(inode); return 0; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index f8e8ee493867..6d85ee8674a6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -73,11 +73,6 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 -/* - * maximum length of inode prealloc list - */ -#define MB_DEFAULT_MAX_INODE_PREALLOC 512 - /* * Number of groups to search linearly before performing group scanning * optimization. diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 12d6252e3e22..3042bc605bbf 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -214,7 +214,6 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); -EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); @@ -264,7 +263,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), - ATTR_LIST(mb_max_inode_prealloc), ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), -- cgit v1.2.3 From e999a5c5a19cf3b679f3d93c49ad5f5c04e4806c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:01 +0000 Subject: fs: Add FGP_WRITEBEGIN This particular combination of flags is used by most filesystems in their ->write_begin method, although it does find use in a few other places. Before folios, it warranted its own function (grab_cache_page_write_begin()), but I think that just having specialised flags is enough. It certainly helps the few places that have been converted from grab_cache_page_write_begin() to __filemap_get_folio(). 
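With the combined flag defined, a typical ->write_begin call site collapses to the following (this exact shape appears in the nfs_write_begin() hunk below):

	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN,
			mapping_gfp_mask(mapping));
	if (!folio)
		return -ENOMEM;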
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-2-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 5 ++--- fs/iomap/buffered-io.c | 2 +- fs/netfs/buffered_read.c | 3 +-- fs/nfs/file.c | 12 ++---------- include/linux/pagemap.h | 2 ++ mm/folio-compat.c | 4 +--- 6 files changed, 9 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 2de9829aed63..0cb361f0a4fe 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -126,7 +126,6 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, { struct address_space *mapping[2]; unsigned int flags; - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { @@ -139,14 +138,14 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, } flags = memalloc_nofs_save(); - folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, + folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); if (!folio[0]) { memalloc_nofs_restore(flags); return -ENOMEM; } - folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, + folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (!folio[1]) { diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6f4c97a6d7e9..10a203515583 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -467,7 +467,7 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); */ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) { - unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; + unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; struct folio *folio; if (iter->flags & IOMAP_NOWAIT) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 7679a68e8193..e3d754a9e1b0 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -341,14 +341,13 @@ int netfs_write_begin(struct netfs_inode *ctx, { struct netfs_io_request *rreq; struct folio *folio; - unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; pgoff_t index = pos >> PAGE_SHIFT; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - folio = __filemap_get_folio(mapping, index, fgp_flags, + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (!folio) return -ENOMEM; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 893625eacab9..2474cbc30712 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -306,15 +306,6 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, return false; } -static struct folio * -nfs_folio_grab_cache_write_begin(struct address_space *mapping, pgoff_t index) -{ - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - - return __filemap_get_folio(mapping, index, fgp_flags, - mapping_gfp_mask(mapping)); -} - /* * This does the "real" work of the write. 
We must allocate and lock the * page to be sent back to the generic routine, which then copies the @@ -335,7 +326,8 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file, mapping->host->i_ino, len, (long long) pos); start: - folio = nfs_folio_grab_cache_write_begin(mapping, pos >> PAGE_SHIFT); + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); if (!folio) return -ENOMEM; *pagep = &folio->page; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0acb8e1fb7af..51b75b89730e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -507,6 +507,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping, #define FGP_ENTRY 0x00000080 #define FGP_STABLE 0x00000100 +#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE) + struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, int fgp_flags, gfp_t gfp); struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, diff --git a/mm/folio-compat.c b/mm/folio-compat.c index cabcd1de9ecb..a71523a06ccd 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -106,9 +106,7 @@ EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index) { - unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - - return pagecache_get_page(mapping, index, fgp_flags, + return pagecache_get_page(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(grab_cache_page_write_begin); -- cgit v1.2.3 From cd57b77197a434709aec0e7fb8b2e6ec8479aa4e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:03 +0000 Subject: ext4: Convert ext4_bio_write_page() to use a folio Remove several calls to compound_head() and the last caller of set_page_writeback_keepwrite(), so remove the wrapper too. Also export bio_add_folio() as this is the first caller from a module. 
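One detail of the conversion worth noting, taken from the io_submit_add_bh() hunk below: bio_add_folio() returns a bool rather than a byte count, so the partial-add comparison against bh->b_size goes away:

	/* before: ret = bio_add_page(...); if (ret != bh->b_size) goto submit_and_retry; */
	if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
		goto submit_and_retry;	/* bio full: submit it and start a new one */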
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-4-willy@infradead.org Signed-off-by: Theodore Ts'o --- block/bio.c | 1 + fs/ext4/page-io.c | 58 +++++++++++++++++++++------------------------- include/linux/page-flags.h | 5 ---- 3 files changed, 28 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/block/bio.c b/block/bio.c index fd11614bba4d..043944fd46eb 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1159,6 +1159,7 @@ bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, return false; return bio_add_page(bio, &folio->page, len, off) > 0; } +EXPORT_SYMBOL(bio_add_folio); void __bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 8703fd732abb..7850d2cb2e08 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -409,12 +409,10 @@ static void io_submit_init_bio(struct ext4_io_submit *io, static void io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, - struct page *pagecache_page, - struct page *bounce_page, + struct folio *folio, + struct folio *io_folio, struct buffer_head *bh) { - int ret; - if (io->io_bio && (bh->b_blocknr != io->io_next_block || !fscrypt_mergeable_bio_bh(io->io_bio, bh))) { submit_and_retry: @@ -422,11 +420,9 @@ submit_and_retry: } if (io->io_bio == NULL) io_submit_init_bio(io, bh); - ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page, - bh->b_size, bh_offset(bh)); - if (ret != bh->b_size) + if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh))) goto submit_and_retry; - wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size); + wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size); io->io_next_block++; } @@ -434,8 +430,9 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct page *page, int len) { - struct page *bounce_page = NULL; - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct folio *io_folio = folio; + struct inode *inode = folio->mapping->host; unsigned block_start; struct buffer_head *bh, *head; int ret = 0; @@ -443,30 +440,30 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc = io->io_wbc; bool keep_towrite = false; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - ClearPageError(page); + folio_clear_error(folio); /* * Comments copied from block_write_full_page: * - * The page straddles i_size. It must be zeroed out on each and every + * The folio straddles i_size. It must be zeroed out on each and every * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); + if (len < folio_size(folio)) + folio_zero_segment(folio, len, folio_size(folio)); /* * In the first loop we prepare and mark buffers to submit. We have to - * mark all buffers in the page before submitting so that - * end_page_writeback() cannot be called from ext4_end_bio() when IO + * mark all buffers in the folio before submitting so that + * folio_end_writeback() cannot be called from ext4_end_bio() when IO * on the first buffer finishes and we are still working on submitting * the second buffer. 
*/ - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_start = bh_offset(bh); if (block_start >= len) { @@ -481,14 +478,14 @@ int ext4_bio_write_page(struct ext4_io_submit *io, clear_buffer_dirty(bh); /* * Keeping dirty some buffer we cannot write? Make sure - * to redirty the page and keep TOWRITE tag so that - * racing WB_SYNC_ALL writeback does not skip the page. + * to redirty the folio and keep TOWRITE tag so that + * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for * transaction commit. */ if (buffer_dirty(bh)) { - if (!PageDirty(page)) - redirty_page_for_writepage(wbc, page); + if (!folio_test_dirty(folio)) + folio_redirty_for_writepage(wbc, folio); keep_towrite = true; } continue; @@ -500,11 +497,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, nr_to_submit++; } while ((bh = bh->b_this_page) != head); - /* Nothing to submit? Just unlock the page... */ + /* Nothing to submit? Just unlock the folio... */ if (!nr_to_submit) return 0; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* * If any blocks are being written to an encrypted file, encrypt them @@ -516,6 +513,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); + struct page *bounce_page; /* * Since bounce page allocation uses a mempool, we can only use @@ -542,7 +540,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); do { if (buffer_async_write(bh)) { clear_buffer_async_write(bh); @@ -553,18 +551,16 @@ int ext4_bio_write_page(struct ext4_io_submit *io, return ret; } + io_folio = page_folio(bounce_page); } - if (keep_towrite) - set_page_writeback_keepwrite(page); - else - set_page_writeback(page); + __folio_start_writeback(folio, keep_towrite); /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - io_submit_add_bh(io, inode, page, bounce_page, bh); + io_submit_add_bh(io, inode, folio, io_folio, bh); } while ((bh = bh->b_this_page) != head); return 0; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index a7e3a3405520..2e00cc14a81c 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -766,11 +766,6 @@ bool set_page_writeback(struct page *page); #define folio_start_writeback_keepwrite(folio) \ __folio_start_writeback(folio, true) -static inline void set_page_writeback_keepwrite(struct page *page) -{ - folio_start_writeback_keepwrite(page_folio(page)); -} - static inline bool test_set_page_writeback(struct page *page) { return set_page_writeback(page); -- cgit v1.2.3 From bb64c08bff6a6edbd85786c92a2cb980ed99b29f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:04 +0000 Subject: ext4: Convert ext4_finish_bio() to use folios Prepare ext4 to support large folios in the page writeback path. Also set the actual error in the mapping, not just -EIO. 
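The structural change is the iterator: the completion path now walks the bio folio by folio instead of in page-sized segments (a condensed sketch of the loop shape from the hunk below):

	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		struct folio *folio = fi.folio;
		size_t bio_start = fi.offset;
		size_t bio_end = bio_start + fi.length;

		/* ... check each buffer_head of the folio against [bio_start, bio_end) ... */
	}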
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-5-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 7850d2cb2e08..f0144ef39bb1 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -99,30 +99,30 @@ static void buffer_io_error(struct buffer_head *bh) static void ext4_finish_bio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct page *bounce_page = NULL; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct folio *io_folio = NULL; struct buffer_head *bh, *head; - unsigned bio_start = bvec->bv_offset; - unsigned bio_end = bio_start + bvec->bv_len; + size_t bio_start = fi.offset; + size_t bio_end = bio_start + fi.length; unsigned under_io = 0; unsigned long flags; - if (fscrypt_is_bounce_page(page)) { - bounce_page = page; - page = fscrypt_pagecache_page(bounce_page); + if (fscrypt_is_bounce_folio(folio)) { + io_folio = folio; + folio = fscrypt_pagecache_folio(folio); } if (bio->bi_status) { - SetPageError(page); - mapping_set_error(page->mapping, -EIO); + int err = blk_status_to_errno(bio->bi_status); + folio_set_error(folio); + mapping_set_error(folio->mapping, err); } - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); /* - * We check all buffers in the page under b_uptodate_lock + * We check all buffers in the folio under b_uptodate_lock * to avoid races with other end io clearing async_write flags */ spin_lock_irqsave(&head->b_uptodate_lock, flags); @@ -141,8 +141,8 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); spin_unlock_irqrestore(&head->b_uptodate_lock, flags); if (!under_io) { - fscrypt_free_bounce_page(bounce_page); - end_page_writeback(page); + fscrypt_free_bounce_page(&io_folio->page); + folio_end_writeback(folio); } } } -- cgit v1.2.3 From 4da2f6e3c45999e904de1edcd06c8533715cc1b5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:05 +0000 Subject: ext4: Turn mpage_process_page() into mpage_process_folio() The page/folio is only used to extract the buffers, so this is a simple change. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-6-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dbcc8b48c7ba..398b0e505300 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2022,21 +2022,22 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } /* - * mpage_process_page - update page buffers corresponding to changed extent and - * may submit fully mapped page for IO - * - * @mpd - description of extent to map, on return next extent to map - * @m_lblk - logical block mapping. - * @m_pblk - corresponding physical mapping. 
- * @map_bh - determines on return whether this page requires any further + * mpage_process_folio - update folio buffers corresponding to changed extent + * and may submit fully mapped page for IO + * @mpd: description of extent to map, on return next extent to map + * @folio: Contains these buffers. + * @m_lblk: logical block mapping. + * @m_pblk: corresponding physical mapping. + * @map_bh: determines on return whether this page requires any further * mapping or not. - * Scan given page buffers corresponding to changed extent and update buffer + * + * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. - * If the given page is not fully mapped, we update @map to the next extent in - * the given page that needs mapping & return @map_bh as true. + * If the given folio is not fully mapped, we update @mpd to the next extent in + * the given folio that needs mapping & return @map_bh as true. */ -static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, +static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { @@ -2049,14 +2050,14 @@ static int mpage_process_page(struct mpage_da_data *mpd, struct page *page, ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. - * Find next buffer in the page to map. + * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; @@ -2129,9 +2130,9 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (nr == 0) break; for (i = 0; i < nr; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - err = mpage_process_page(mpd, page, &lblk, &pblock, + err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh @@ -2141,10 +2142,10 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ - err = mpage_submit_page(mpd, page); + err = mpage_submit_page(mpd, &folio->page); if (err < 0) goto out; - mpage_page_done(mpd, page); + mpage_page_done(mpd, &folio->page); } folio_batch_release(&fbatch); } -- cgit v1.2.3 From 81a0d3e126a0bb4300d1db259d89b839124f2cff Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:06 +0000 Subject: ext4: Convert mpage_submit_page() to mpage_submit_folio() All callers now have a folio so we can pass one in and use the folio APIs to support large folios as well as save instructions by eliminating calls to compound_head(). 
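The heart of the change is the tail-zeroing length computation, which now works in folio units (condensed from the hunk below): the straddle test compares folio_pos() plus folio_size() against i_size instead of checking a single page index:

	size = i_size_read(mpd->inode);
	len = folio_size(folio);
	if (folio_pos(folio) + len > size &&
	    !ext4_verity_in_progress(mpd->inode))
		len = size & ~PAGE_MASK;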
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-7-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 398b0e505300..dcb852e7e1cc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1869,34 +1869,33 @@ static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) unlock_page(page); } -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { - int len; + size_t len; loff_t size; int err; - BUG_ON(page->index != mpd->first_page); - clear_page_dirty_for_io(page); + BUG_ON(folio->index != mpd->first_page); + folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing - * data through mmap while writeback runs. clear_page_dirty_for_io() + * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get - * written to again until we release page lock. So only after - * clear_page_dirty_for_io() we are safe to sample i_size for + * written to again until we release folio lock. So only after + * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_page() to zero-out tail of the written page. We rely * on the barrier provided by TestClearPageDirty in - * clear_page_dirty_for_io() to make sure i_size is really sampled only + * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); - if (page->index == size >> PAGE_SHIFT && + len = folio_size(folio); + if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; - err = ext4_bio_write_page(&mpd->io_submit, page, len); + err = ext4_bio_write_page(&mpd->io_submit, &folio->page, len); if (!err) mpd->wbc->nr_to_write--; @@ -2009,7 +2008,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { - err = mpage_submit_page(mpd, head->b_page); + err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_page_done(mpd, head->b_page); @@ -2142,7 +2141,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_page_done(mpd, &folio->page); @@ -2532,12 +2531,12 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (ext4_page_nomap_can_writeout(&folio->page)) { WARN_ON_ONCE(sb->s_writers.frozen == SB_FREEZE_COMPLETE); - err = mpage_submit_page(mpd, &folio->page); + err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; } /* Pending dirtying of journalled data? 
*/ - if (PageChecked(&folio->page)) { + if (folio_test_checked(folio)) { WARN_ON_ONCE(sb->s_writers.frozen >= SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, -- cgit v1.2.3 From 33483b3b6ee4328f37c3dcf702ba979e6a00bf8f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:07 +0000 Subject: ext4: Convert mpage_page_done() to mpage_folio_done() All callers now have a folio so we can pass one in and use the folio APIs to support large folios as well as save instructions by eliminating a call to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-8-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dcb852e7e1cc..916c923a25b8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1863,10 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) +static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { - mpd->first_page++; - unlock_page(page); + mpd->first_page += folio_nr_pages(folio); + folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) @@ -2011,7 +2011,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; - mpage_page_done(mpd, head->b_page); + mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; @@ -2144,7 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; - mpage_page_done(mpd, &folio->page); + mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } @@ -2544,7 +2544,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (err < 0) goto out; } - mpage_page_done(mpd, &folio->page); + mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << -- cgit v1.2.3 From e8d6062c50acbf1aba88ca6adaa1bcda058abeab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:08 +0000 Subject: ext4: Convert ext4_bio_write_page() to ext4_bio_write_folio() The only caller now has a folio so pass it in directly and avoid the call to page_folio() at the beginning. 
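A hedged sketch of the calling convention this conversion establishes (the shim below is hypothetical; the patch simply changes the one caller rather than keeping a page-based entry point):

	/* Hypothetical compatibility shim, for illustration only. */
	static int example_bio_write_page(struct ext4_io_submit *io,
					  struct page *page, size_t len)
	{
		/* One explicit head-page lookup here... */
		struct folio *folio = page_folio(page);

		/* ...so the callee can use folio calls with no hidden
		 * compound_head() per PageFoo()/unlock_page() helper. */
		return ext4_bio_write_folio(io, folio, len);
	}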
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-9-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++--- fs/ext4/inode.c | 6 +++--- fs/ext4/page-io.c | 10 ++++------ 3 files changed, 9 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 993d3284b430..3535338caf0d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3756,9 +3756,8 @@ extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); -extern int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len); +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, + size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 916c923a25b8..2be604af7aec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1885,8 +1885,8 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for - * ext4_bio_write_page() to zero-out tail of the written page. We rely - * on the barrier provided by TestClearPageDirty in + * ext4_bio_write_folio() to zero-out tail of the written page. We rely + * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ @@ -1895,7 +1895,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; - err = ext4_bio_write_page(&mpd->io_submit, &folio->page, len); + err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index f0144ef39bb1..8fe1875b0a42 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -426,11 +426,9 @@ submit_and_retry: io->io_next_block++; } -int ext4_bio_write_page(struct ext4_io_submit *io, - struct page *page, - int len) +int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, + size_t len) { - struct folio *folio = page_folio(page); struct folio *io_folio = folio; struct inode *inode = folio->mapping->host; unsigned block_start; @@ -523,8 +521,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(page, enc_bytes, - 0, gfp_flags); + bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && -- cgit v1.2.3 From 3edde93e07954a8860d67be4a2165514a083b6e8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:09 +0000 Subject: ext4: Convert ext4_readpage_inline() to take a folio Use the folio API in this function, saves a few calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-10-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/inline.c | 14 +++++++------- fs/ext4/inode.c | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3535338caf0d..6b21b3aa92d6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3549,7 +3549,7 @@ extern int ext4_init_inline_data(handle_t *handle, struct inode *inode, unsigned int len); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); -extern int ext4_readpage_inline(struct inode *inode, struct page *page); +int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 1602d74b5eeb..e9bae3002319 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -501,7 +501,7 @@ out: return ret; } -int ext4_readpage_inline(struct inode *inode, struct page *page) +int ext4_readpage_inline(struct inode *inode, struct folio *folio) { int ret = 0; @@ -515,16 +515,16 @@ int ext4_readpage_inline(struct inode *inode, struct page *page) * Current inline data can only exist in the 1st page, * So for all the other pages, just set them uptodate. */ - if (!page->index) - ret = ext4_read_inline_page(inode, page); - else if (!PageUptodate(page)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); + if (!folio->index) + ret = ext4_read_inline_page(inode, &folio->page); + else if (!folio_test_uptodate(folio)) { + folio_zero_segment(folio, 0, folio_size(folio)); + folio_mark_uptodate(folio); } up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); + folio_unlock(folio); return ret >= 0 ? 0 : ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2be604af7aec..81bcc73f610f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3155,7 +3155,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio) trace_ext4_readpage(page); if (ext4_has_inline_data(inode)) - ret = ext4_readpage_inline(inode, page); + ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, page); -- cgit v1.2.3 From 83eba701cf6e582afa92987e34abc0b0dbcb690e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:10 +0000 Subject: ext4: Convert ext4_convert_inline_data_to_extent() to use a folio Saves a number of calls to compound_head(). 
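The conversion below swaps grab_cache_page_write_begin() bracketed by memalloc_nofs_save()/memalloc_nofs_restore() for a single __filemap_get_folio() call. A minimal sketch of the new lookup, assuming the API of this era (__filemap_get_folio() returns NULL on failure, and FGP_NOFS clears __GFP_FS, folding the scoped-NOFS dance into the lookup itself):

	#include <linux/pagemap.h>

	/* Illustrative helper, not from the patch. */
	static struct folio *example_grab_write_folio_nofs(struct address_space *mapping,
							   pgoff_t index)
	{
		return __filemap_get_folio(mapping, index,
					   FGP_WRITEBEGIN | FGP_NOFS,
					   mapping_gfp_mask(mapping));
	}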
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-11-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index e9bae3002319..f339340ba66c 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -534,8 +534,7 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping, int ret, needed_blocks, no_expand; handle_t *handle = NULL; int retries = 0, sem_held = 0; - struct page *page = NULL; - unsigned int flags; + struct folio *folio = NULL; unsigned from, to; struct ext4_iloc iloc; @@ -564,10 +563,9 @@ retry: /* We cannot recurse into the filesystem as the transaction is already * started */ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (!folio) { ret = -ENOMEM; goto out; } @@ -582,8 +580,8 @@ retry: from = 0; to = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_page(inode, &folio->page); if (ret < 0) goto out; } @@ -593,21 +591,21 @@ retry: goto out; if (ext4_should_dioread_nolock(inode)) { - ret = __block_write_begin(page, from, to, + ret = __block_write_begin(&folio->page, from, to, ext4_get_block_unwritten); } else - ret = __block_write_begin(page, from, to, ext4_get_block); + ret = __block_write_begin(&folio->page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), - from, to, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); + folio = NULL; ext4_orphan_add(handle, inode); ext4_write_unlock_xattr(inode, &no_expand); sem_held = 0; @@ -627,12 +625,12 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - if (page) - block_commit_write(page, from, to); + if (folio) + block_commit_write(&folio->page, from, to); out: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } if (sem_held) ext4_write_unlock_xattr(inode, &no_expand); -- cgit v1.2.3 From f8f8c89f59f7ab037bfca8797e2cc613a5684f21 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:11 +0000 Subject: ext4: Convert ext4_try_to_write_inline_data() to use a folio Saves a number of calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-12-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f339340ba66c..881d559c503f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -653,8 +653,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, { int ret; handle_t *handle; - unsigned int flags; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; if (pos + len > ext4_get_max_inline_size(inode)) @@ -691,28 +690,27 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, if (ret) goto out; - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (!folio) { ret = -ENOMEM; goto out; } - *pagep = page; + *pagep = &folio->page; down_read(&EXT4_I(inode)->xattr_sem); if (!ext4_has_inline_data(inode)) { ret = 0; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_page(inode, &folio->page); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out_up_read; } } -- cgit v1.2.3 From 4ed9b598ac30913987ab46e0069620e6e8af82f0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:12 +0000 Subject: ext4: Convert ext4_da_convert_inline_data_to_extent() to use a folio Saves a number of calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-13-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 881d559c503f..45d74274d822 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -848,10 +848,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, void **fsdata) { int ret = 0, inline_size; - struct page *page; + struct folio *folio; - page = grab_cache_page_write_begin(mapping, 0); - if (!page) + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; down_read(&EXT4_I(inode)->xattr_sem); @@ -862,32 +863,32 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, inline_size = ext4_get_inline_size(inode); - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_page(inode, &folio->page); if (ret < 0) goto out; } - ret = __block_write_begin(page, 0, inline_size, + ret = __block_write_begin(&folio->page, 0, inline_size, ext4_da_get_block_prep); if (ret) { up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_truncate_failed_write(inode); return ret; } - SetPageDirty(page); - SetPageUptodate(page); + folio_mark_dirty(folio); + folio_mark_uptodate(folio); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); *fsdata = (void *)CONVERT_INLINE_DATA; out: up_read(&EXT4_I(inode)->xattr_sem); - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } return ret; } -- cgit v1.2.3 From 9a9d01f081ea29a5a8afc4504b1bc48daffa5cc1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:13 +0000 Subject: ext4: Convert ext4_da_write_inline_data_begin() to use a folio Saves a number of calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-14-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 45d74274d822..2fa6c51baef9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -909,10 +909,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping, { int ret; handle_t *handle; - struct page *page; + struct folio *folio; struct ext4_iloc iloc; int retries = 0; - unsigned int flags; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -944,10 +943,9 @@ retry_journal: * We cannot recurse into the filesystem as the transaction * is already started. 
*/ - flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, 0); - memalloc_nofs_restore(flags); - if (!page) { + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, + mapping_gfp_mask(mapping)); + if (!folio) { ret = -ENOMEM; goto out_journal; } @@ -958,8 +956,8 @@ retry_journal: goto out_release_page; } - if (!PageUptodate(page)) { - ret = ext4_read_inline_page(inode, page); + if (!folio_test_uptodate(folio)) { + ret = ext4_read_inline_page(inode, &folio->page); if (ret < 0) goto out_release_page; } @@ -969,13 +967,13 @@ retry_journal: goto out_release_page; up_read(&EXT4_I(inode)->xattr_sem); - *pagep = page; + *pagep = &folio->page; brelse(iloc.bh); return 1; out_release_page: up_read(&EXT4_I(inode)->xattr_sem); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_journal: ext4_journal_stop(handle); out: -- cgit v1.2.3 From 6b87fbe4155007c3ab8e950c72db657f6cd990c6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:14 +0000 Subject: ext4: Convert ext4_read_inline_page() to ext4_read_inline_folio() All callers now have a folio, so pass it and use it. The folio may be large, although I doubt we'll want to use a large folio for an inline file. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-15-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2fa6c51baef9..4c819b6c70c1 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -467,16 +467,16 @@ out: return error; } -static int ext4_read_inline_page(struct inode *inode, struct page *page) +static int ext4_read_inline_folio(struct inode *inode, struct folio *folio) { void *kaddr; int ret = 0; size_t len; struct ext4_iloc iloc; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(!ext4_has_inline_data(inode)); - BUG_ON(page->index); + BUG_ON(folio->index); if (!EXT4_I(inode)->i_inline_off) { ext4_warning(inode->i_sb, "inode %lu doesn't have inline data.", @@ -489,12 +489,13 @@ static int ext4_read_inline_page(struct inode *inode, struct page *page) goto out; len = min_t(size_t, ext4_get_inline_size(inode), i_size_read(inode)); - kaddr = kmap_atomic(page); + BUG_ON(len > PAGE_SIZE); + kaddr = kmap_local_folio(folio, 0); ret = ext4_read_inline_data(inode, kaddr, len, &iloc); - flush_dcache_page(page); - kunmap_atomic(kaddr); - zero_user_segment(page, len, PAGE_SIZE); - SetPageUptodate(page); + flush_dcache_folio(folio); + kunmap_local(kaddr); + folio_zero_segment(folio, len, folio_size(folio)); + folio_mark_uptodate(folio); brelse(iloc.bh); out: @@ -516,7 +517,7 @@ int ext4_readpage_inline(struct inode *inode, struct folio *folio) * So for all the other pages, just set them uptodate. 
*/ if (!folio->index) - ret = ext4_read_inline_page(inode, &folio->page); + ret = ext4_read_inline_folio(inode, folio); else if (!folio_test_uptodate(folio)) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); @@ -581,7 +582,7 @@ retry: from = 0; to = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_page(inode, &folio->page); + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } @@ -707,7 +708,7 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, } if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_page(inode, &folio->page); + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); @@ -864,7 +865,7 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, inline_size = ext4_get_inline_size(inode); if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_page(inode, &folio->page); + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out; } @@ -957,7 +958,7 @@ retry_journal: } if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_page(inode, &folio->page); + ret = ext4_read_inline_folio(inode, folio); if (ret < 0) goto out_release_page; } -- cgit v1.2.3 From 6b90d4130ac8ee9cf2a179a617cfced71a18d252 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:15 +0000 Subject: ext4: Convert ext4_write_inline_data_end() to use a folio Convert the incoming page to a folio so that we call compound_head() only once instead of seven times. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-16-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inline.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 4c819b6c70c1..b9fb1177fff6 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -732,20 +732,21 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); int no_expand; void *kaddr; struct ext4_iloc iloc; int ret = 0, ret2; - if (unlikely(copied < len) && !PageUptodate(page)) + if (unlikely(copied < len) && !folio_test_uptodate(folio)) copied = 0; if (likely(copied)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ext4_std_error(inode->i_sb, ret); goto out; } @@ -759,30 +760,30 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, */ (void) ext4_find_inline_data_nolock(inode); - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + kunmap_local(kaddr); + folio_mark_uptodate(folio); + /* clear dirty flag so that writepages wouldn't work for us. */ + folio_clear_dirty(folio); ext4_write_unlock_xattr(inode, &no_expand); brelse(iloc.bh); /* - * It's important to update i_size while still holding page + * It's important to update i_size while still holding folio * lock: page writeout could otherwise come in and zero * beyond i_size. 
*/ ext4_update_inode_size(inode, pos + copied); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (likely(copied)) -- cgit v1.2.3 From 4d934a5e6caa6dcdd3fbee7b96fe512a455863b6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:16 +0000 Subject: ext4: Convert ext4_write_begin() to use a folio Remove a lot of calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-17-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 81bcc73f610f..d499417ffce7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1139,7 +1139,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, int ret, needed_blocks; handle_t *handle; int retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; unsigned from, to; @@ -1166,68 +1166,69 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, } /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page + * __filemap_get_folio() can take a long time if the + * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. + * the folio (if needed) without using GFP_NOFS. */ retry_grab: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; /* * The same as page allocation, we prealloc buffer heads before * starting the handle. 
*/ - if (!page_has_buffers(page)) - create_empty_buffers(page, inode->i_sb->s_blocksize, 0); + if (!folio_buffers(folio)) + create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); - unlock_page(page); + folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { - put_page(page); + folio_put(folio); return PTR_ERR(handle); } - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); + folio_lock(folio); + if (folio->mapping != mapping) { + /* The folio got truncated from under us */ + folio_unlock(folio); + folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) - ret = __block_write_begin(page, pos, len, + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else - ret = __block_write_begin(page, pos, len, ext4_get_block); + ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, - page_buffers(page), from, to, NULL, - do_journal_get_write_access); + folio_buffers(folio), from, to, + NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); - unlock_page(page); + folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -1255,10 +1256,10 @@ retry_journal: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; - put_page(page); + folio_put(folio); return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } -- cgit v1.2.3 From 64fb31367598188a0a230b81c6f4397fa71fd033 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:17 +0000 Subject: ext4: Convert ext4_write_end() to use a folio Convert the incoming struct page to a folio. Replaces two implicit calls to compound_head() with one explicit call. 
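The "two implicit calls" are the compound_head() lookups hidden inside the page-based helpers; a hedged illustration (this function is invented for the example, but unlock_page() and put_page() really are folio operations wrapped around an internal head-page lookup):

	/* Illustration only. */
	static void example_release(struct page *page)
	{
		struct folio *folio = page_folio(page);	/* the one explicit call */

		folio_unlock(folio);	/* was unlock_page(): hidden conversion */
		folio_put(folio);	/* was put_page(): hidden conversion */
	}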
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-18-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d499417ffce7..141e050de960 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1289,6 +1289,7 @@ static int ext4_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1304,7 +1305,7 @@ static int ext4_write_end(struct file *file, copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* - * it's important to update i_size while still holding page lock: + * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree @@ -1312,15 +1313,15 @@ static int ext4_write_end(struct file *file, */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling + * Don't mark the inode dirty under folio lock. First, it unnecessarily + * makes the holding time of folio lock longer. Second, it forces lock + * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) -- cgit v1.2.3 From feb22b77b855a6529675b4e998970ab461c0f446 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:18 +0000 Subject: ext4: Use a folio in ext4_journalled_write_end() Convert the incoming page to a folio to remove a few calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-19-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 141e050de960..9cf3daa11534 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1392,6 +1392,7 @@ static int ext4_journalled_write_end(struct file *file, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; @@ -1410,25 +1411,26 @@ static int ext4_journalled_write_end(struct file *file, if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, page); - if (unlikely(copied < len) && !PageUptodate(page)) { + if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); - ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, + folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); -- cgit v1.2.3 From 86324a21627a40f949bf787b55c45b9856523f9d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:19 +0000 Subject: ext4: Convert ext4_journalled_zero_new_buffers() to use a folio Remove a call to compound_head(). 
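The conversion below walks the folio's buffer ring, an idiom that recurs throughout this series: folio_buffers() returns the first buffer_head, and b_this_page links the buffers in a circle back to the head. A self-contained sketch with an illustrative predicate (assumes buffers are already attached):

	#include <linux/buffer_head.h>

	/* Illustration only: visit every buffer_head attached to a folio. */
	static bool example_folio_has_dirty_buffer(struct folio *folio)
	{
		struct buffer_head *bh, *head;

		bh = head = folio_buffers(folio);
		do {
			if (buffer_dirty(bh))
				return true;
		} while ((bh = bh->b_this_page) != head);
		return false;
	}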
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-20-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9cf3daa11534..8d3b0742428d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1359,24 +1359,24 @@ static int ext4_write_end(struct file *file, */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, - struct page *page, + struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; - zero_user(page, start, size); + folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); @@ -1413,10 +1413,11 @@ static int ext4_journalled_write_end(struct file *file, if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, inode, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, folio, + from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, inode, page, + ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), -- cgit v1.2.3 From 9d3973de9a3745ea9d38bdfb953a4c4bee81ac2a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:20 +0000 Subject: ext4: Convert __ext4_block_zero_page_range() to use a folio Use folio APIs throughout. Saves many calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230324180129.1220691-21-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d3b0742428d..bf646fc76e9d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3669,23 +3669,26 @@ static int __ext4_block_zero_page_range(handle_t *handle, ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; - struct page *page; + struct folio *folio; int err = 0; - page = find_or_create_page(mapping, from >> PAGE_SHIFT, - mapping_gfp_constraint(mapping, ~__GFP_FS)); - if (!page) + folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (!folio) return -ENOMEM; blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + bh = folio_buffers(folio); + if (!bh) { + create_empty_buffers(&folio->page, blocksize, 0); + bh = folio_buffers(folio); + } /* Find the buffer that contains "offset" */ - bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3707,7 +3710,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, } /* Ok, it's mapped. 
Make sure it's up-to-date */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { @@ -3717,7 +3720,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); - err = fscrypt_decrypt_pagecache_blocks(page_folio(page), + err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { @@ -3733,7 +3736,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, if (err) goto unlock; } - zero_user(page, offset, length); + folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { @@ -3747,8 +3750,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return err; } -- cgit v1.2.3 From 02e4b04c56d03a518b958783900b22f33c6643d6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:21 +0000 Subject: ext4: Convert ext4_page_nomap_can_writeout to ext4_folio_nomap_can_writeout Its one caller already uses a folio. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Theodore Ts'o Link: https://lore.kernel.org/r/20230324180129.1220691-22-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf646fc76e9d..bf1a19c09b5d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2335,12 +2335,12 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the page needs to be written as part of transaction commit */ -static bool ext4_page_nomap_can_writeout(struct page *page) +/* Return true if the folio needs to be written as part of transaction commit */ +static bool ext4_folio_nomap_can_writeout(struct folio *folio) { struct buffer_head *bh, *head; - bh = head = page_buffers(page); + bh = head = folio_buffers(folio); do { if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) return true; @@ -2533,7 +2533,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * range operations before discarding the page cache. */ if (!mpd->can_map) { - if (ext4_page_nomap_can_writeout(&folio->page)) { + if (ext4_folio_nomap_can_writeout(folio)) { WARN_ON_ONCE(sb->s_writers.frozen == SB_FREEZE_COMPLETE); err = mpage_submit_folio(mpd, folio); -- cgit v1.2.3 From 0b5a254395dc6db5c38d89e606c0298ed4c9e984 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:22 +0000 Subject: ext4: Use a folio in ext4_da_write_begin() Remove a few calls to compound_head(). 
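The lookup in the diff below uses FGP_WRITEBEGIN, added earlier in this same patch series. As I read the pagemap header of this era, it bundles FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE, i.e. "return a locked, writable, created-if-absent, writeback-stable folio"; a sketch:

	#include <linux/pagemap.h>

	/* Illustrative helper, not from the patch. */
	static struct folio *example_write_begin_lookup(struct address_space *mapping,
							pgoff_t index)
	{
		/* FGP_STABLE waits for writeback, so the caller
		 * receives a folio that is safe to modify. */
		return __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
					   mapping_gfp_mask(mapping));
	}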
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-23-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bf1a19c09b5d..9e8afac5c82e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2902,7 +2902,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret, retries = 0; - struct page *page; + struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; @@ -2929,22 +2929,23 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; - /* In case writeback began while the page was unlocked */ - wait_for_stable_page(page); + /* In case writeback began while the folio was unlocked */ + folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(page, pos, len, + ret = ext4_block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #else - ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); + ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need @@ -2959,7 +2960,7 @@ retry: return ret; } - *pagep = page; + *pagep = &folio->page; return ret; } -- cgit v1.2.3 From c0be8e6f081b3e966e21f52679b2f809b7df10b8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:23 +0000 Subject: ext4: Convert ext4_mpage_readpages() to work on folios This definitely doesn't include support for large folios; there are all kinds of assumptions about the number of buffers attached to a folio. But it does remove several calls to compound_head(). 
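One of the page-sized assumptions mentioned above is the fixed blocks_per_page arithmetic visible in the diff below. A hedged sketch of the difference (both helpers are illustrative, not from the patch):

	/* Illustration only: block counts for one page vs. an arbitrary folio. */
	static unsigned int example_blocks_per_page(struct inode *inode)
	{
		return PAGE_SIZE >> inode->i_blkbits;		/* compile-time shape */
	}

	static unsigned int example_blocks_per_folio(struct folio *folio,
						     struct inode *inode)
	{
		return folio_size(folio) >> inode->i_blkbits;	/* varies per folio */
	}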
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-24-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 +- fs/ext4/inode.c | 7 +++---- fs/ext4/readpage.c | 58 ++++++++++++++++++++++++++---------------------------- 3 files changed, 32 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6b21b3aa92d6..83f0cc02250f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3646,7 +3646,7 @@ static inline void ext4_set_de_type(struct super_block *sb, /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page); + struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9e8afac5c82e..17a6607bcaf2 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3154,17 +3154,16 @@ out: static int ext4_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; int ret = -EAGAIN; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; - trace_ext4_readpage(page); + trace_ext4_readpage(&folio->page); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) - return ext4_mpage_readpages(inode, NULL, page); + return ext4_mpage_readpages(inode, NULL, folio); return ret; } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index c61dc8a7c014..fed4ddb652df 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -218,7 +218,7 @@ static inline loff_t ext4_readpage_limit(struct inode *inode) } int ext4_mpage_readpages(struct inode *inode, - struct readahead_control *rac, struct page *page) + struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; @@ -247,16 +247,15 @@ int ext4_mpage_readpages(struct inode *inode, int fully_mapped = 1; unsigned first_hole = blocks_per_page; - if (rac) { - page = readahead_page(rac); - prefetchw(&page->flags); - } + if (rac) + folio = readahead_folio(rac); + prefetchw(&folio->flags); - if (page_has_buffers(page)) + if (folio_buffers(folio)) goto confused; block_in_file = next_block = - (sector_t)page->index << (PAGE_SHIFT - blkbits); + (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; @@ -290,7 +289,7 @@ int ext4_mpage_readpages(struct inode *inode, /* * Then do more ext4_map_blocks() calls until we are - * done with this page. + * done with this folio. 
*/ while (page_block < blocks_per_page) { if (block_in_file < last_block) { @@ -299,10 +298,10 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: - SetPageError(page); - zero_user_segment(page, 0, - PAGE_SIZE); - unlock_page(page); + folio_set_error(folio); + folio_zero_segment(folio, 0, + folio_size(folio)); + folio_unlock(folio); goto next_page; } } @@ -333,22 +332,22 @@ int ext4_mpage_readpages(struct inode *inode, } } if (first_hole != blocks_per_page) { - zero_user_segment(page, first_hole << blkbits, - PAGE_SIZE); + folio_zero_segment(folio, first_hole << blkbits, + folio_size(folio)); if (first_hole == 0) { - if (ext4_need_verity(inode, page->index) && - !fsverity_verify_page(page)) + if (ext4_need_verity(inode, folio->index) && + !fsverity_verify_page(&folio->page)) goto set_error_page; - SetPageUptodate(page); - unlock_page(page); - goto next_page; + folio_mark_uptodate(folio); + folio_unlock(folio); + continue; } } else if (fully_mapped) { - SetPageMappedToDisk(page); + folio_set_mappedtodisk(folio); } /* - * This page will go to BIO. Do we need to send this + * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || @@ -366,7 +365,7 @@ int ext4_mpage_readpages(struct inode *inode, REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); - ext4_set_bio_post_read_ctx(bio, inode, page->index); + ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) @@ -374,7 +373,7 @@ int ext4_mpage_readpages(struct inode *inode, } length = first_hole << blkbits; - if (bio_add_page(bio, page, length, 0) < length) + if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && @@ -384,19 +383,18 @@ int ext4_mpage_readpages(struct inode *inode, bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; - goto next_page; + continue; confused: if (bio) { submit_bio(bio); bio = NULL; } - if (!PageUptodate(page)) - block_read_full_folio(page_folio(page), ext4_get_block); + if (!folio_test_uptodate(folio)) + block_read_full_folio(folio, ext4_get_block); else - unlock_page(page); - next_page: - if (rac) - put_page(page); + folio_unlock(folio); +next_page: + ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); -- cgit v1.2.3 From 86b38c273cc68ce7b50649447d8ac0ddf3228026 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:24 +0000 Subject: ext4: Convert ext4_block_write_begin() to take a folio All the callers now have a folio, so pass that in and operate on folios. Removes four calls to compound_head(). 
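The conversion below (like several earlier ones) uses the attach-if-missing idiom: folio_buffers() returns NULL when no buffer heads are attached yet. A minimal sketch; note that create_empty_buffers() still took a struct page at this point, hence the &folio->page escape hatch:

	#include <linux/buffer_head.h>

	/* Illustration only: get a folio's buffers, creating them if absent. */
	static struct buffer_head *example_folio_buffers(struct folio *folio,
							 unsigned long blocksize)
	{
		struct buffer_head *head = folio_buffers(folio);

		if (!head) {
			create_empty_buffers(&folio->page, blocksize, 0);
			head = folio_buffers(folio);
		}
		return head;
	}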
Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ritesh Harjani (IBM) Link: https://lore.kernel.org/r/20230324180129.1220691-25-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17a6607bcaf2..b6e556c71063 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1030,12 +1030,12 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, } #ifdef CONFIG_FS_ENCRYPTION -static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, +static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; @@ -1045,22 +1045,24 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, int nr_wait = 0; int i; - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - head = page_buffers(page); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } bbits = ilog2(blocksize); - block = (sector_t)page->index << (PAGE_SHIFT - bbits); + block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; @@ -1073,19 +1075,20 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) - zero_user_segments(page, to, block_end, - block_start, from); + folio_zero_segments(folio, to, + block_end, + block_start, from); continue; } } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } @@ -1105,14 +1108,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, err = -EIO; } if (unlikely(err)) { - page_zero_new_buffers(page, from, to); + page_zero_new_buffers(&folio->page, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; - err2 = fscrypt_decrypt_pagecache_blocks(page_folio(page), - blocksize, - bh_offset(wait[i])); + err2 = fscrypt_decrypt_pagecache_blocks(folio, + blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; @@ -1206,11 +1208,10 @@ retry_journal: #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) - ret = ext4_block_write_begin(&folio->page, pos, len, + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else - ret = ext4_block_write_begin(&folio->page, pos, len, - ext4_get_block); + ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(&folio->page, pos, len, @@ -2938,8 +2939,7 @@ 
retry: folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION - ret = ext4_block_write_begin(&folio->page, pos, len, - ext4_da_get_block_prep); + ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif -- cgit v1.2.3 From 9ea0e45bd2f6cbfba787360f5ba8e18deabb7671 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:25 +0000 Subject: ext4: Use a folio in ext4_page_mkwrite() Convert to the folio API, saving a few calls to compound_head(). Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-26-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b6e556c71063..6d628d6c0847 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -6075,7 +6075,7 @@ static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); loff_t size; unsigned long len; int err; @@ -6119,19 +6119,18 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) goto out_ret; } - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? */ - if (page->mapping != mapping || page_offset(page) > size) { - unlock_page(page); + if (folio->mapping != mapping || folio_pos(folio) > size) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); /* * Return if we have all the buffers mapped. This avoids the need to do * journal_start/journal_stop which can block and take a long time @@ -6139,17 +6138,17 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * This cannot be done for data journalling, as we have to add the * inode to the transaction's list to writeprotect pages on commit. */ - if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), + if (folio_buffers(folio)) { + if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_for_stable_page(page); + folio_wait_stable(folio); ret = VM_FAULT_LOCKED; goto out; } } - unlock_page(page); + folio_unlock(folio); /* OK, we need to fill the hole... */ if (ext4_should_dioread_nolock(inode)) get_block = ext4_get_block_unwritten; @@ -6170,26 +6169,25 @@ retry_alloc: if (!ext4_should_journal_data(inode)) { err = block_page_mkwrite(vma, vmf, get_block); } else { - lock_page(page); + folio_lock(folio); size = i_size_read(inode); /* Page got truncated from under us? 
*/ - if (page->mapping != mapping || page_offset(page) > size) { + if (folio->mapping != mapping || folio_pos(folio) > size) { ret = VM_FAULT_NOPAGE; goto out_error; } - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + len = folio_size(folio); + if (folio_pos(folio) + len > size) + len = size - folio_pos(folio); - err = __block_write_begin(page, 0, len, ext4_get_block); + err = __block_write_begin(&folio->page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_journal_page_buffers(handle, page, len)) + if (ext4_journal_page_buffers(handle, &folio->page, len)) goto out_error; } else { - unlock_page(page); + folio_unlock(folio); } } ext4_journal_stop(handle); @@ -6202,7 +6200,7 @@ out: sb_end_pagefault(inode->i_sb); return ret; out_error: - unlock_page(page); + folio_unlock(folio); ext4_journal_stop(handle); goto out; } -- cgit v1.2.3 From f2b229a8c6c2633c35cb7446cfabea5a6f721edc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:26 +0000 Subject: ext4: Use a folio iterator in __read_end_io() Iterate once per folio, not once per page. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-27-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/readpage.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index fed4ddb652df..6f46823fba61 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -68,18 +68,16 @@ struct bio_post_read_ctx { static void __read_end_io(struct bio *bio) { - struct page *page; - struct bio_vec *bv; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bv, bio, iter_all) { - page = bv->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; if (bio->bi_status) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); -- cgit v1.2.3 From 3060b6ef05603cf3c05b2b746f739b0169bd75f9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:27 +0000 Subject: ext4: Convert mext_page_mkuptodate() to take a folio Use a folio throughout. Does not support large folios due to an array sized for MAX_BUF_PER_PAGE, but it does remove a few calls to compound_head(). 
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-28-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 0cb361f0a4fe..e509c22a21ed 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -168,25 +168,27 @@ mext_folio_double_lock(struct inode *inode1, struct inode *inode2, /* Force page buffers uptodate w/o dropping page's lock */ static int -mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) +mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize, block_start, block_end; int i, err, nr = 0, partial = 0; - BUG_ON(!PageLocked(page)); - BUG_ON(PageWriteback(page)); + BUG_ON(!folio_test_locked(folio)); + BUG_ON(folio_test_writeback(folio)); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) { + create_empty_buffers(&folio->page, blocksize, 0); + head = folio_buffers(folio); + } - head = page_buffers(page); - block = (sector_t)page->index << (PAGE_SHIFT - inode->i_blkbits); + block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; @@ -200,11 +202,11 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) if (!buffer_mapped(bh)) { err = ext4_get_block(inode, block, bh, 0); if (err) { - SetPageError(page); + folio_set_error(folio); return err; } if (!buffer_mapped(bh)) { - zero_user(page, block_start, blocksize); + folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } @@ -226,7 +228,7 @@ mext_page_mkuptodate(struct page *page, unsigned from, unsigned to) } out: if (!partial) - SetPageUptodate(page); + folio_mark_uptodate(folio); return 0; } @@ -354,7 +356,7 @@ again: goto unlock_folios; } data_copy: - *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); + *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; -- cgit v1.2.3 From b23fb762785babc1d6194770c88432da037c8a64 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:28 +0000 Subject: ext4: Convert pagecache_read() to use a folio Use the folio API and support folios of arbitrary sizes. 
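For illustration, the shape of the converted loop is sketched below, with the bookkeeping the truncated hunk implies (count is decremented by the amount copied). memcpy_from_file_folio() copies at most the bytes remaining in the folio at pos and returns how much it copied, so no explicit PAGE_SIZE arithmetic is needed and arbitrarily large folios work:

        while (count) {
                struct folio *folio;
                size_t n;

                folio = read_mapping_folio(inode->i_mapping,
                                           pos >> PAGE_SHIFT, NULL);
                if (IS_ERR(folio))
                        return PTR_ERR(folio);

                /* Copies up to the end of this folio; returns bytes copied. */
                n = memcpy_from_file_folio(buf, folio, pos, count);
                folio_put(folio);

                buf += n;
                pos += n;
                count -= n;
        }
        return 0;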
Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-29-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index e4da1704438e..afe847c967a4 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -42,18 +42,16 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, loff_t pos) { while (count) { - size_t n = min_t(size_t, count, - PAGE_SIZE - offset_in_page(pos)); - struct page *page; + struct folio *folio; + size_t n; - page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, + folio = read_mapping_folio(inode->i_mapping, pos >> PAGE_SHIFT, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - - memcpy_from_page(buf, page, offset_in_page(pos), n); + if (IS_ERR(folio)) + return PTR_ERR(folio); - put_page(page); + n = memcpy_from_file_folio(buf, folio, pos, count); + folio_put(folio); buf += n; pos += n; -- cgit v1.2.3 From e9ebecf266c6657de5865a02a47c0d6b2460c526 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 24 Mar 2023 18:01:29 +0000 Subject: ext4: Use a folio in ext4_read_merkle_tree_page This is an implementation of fsverity_operations read_merkle_tree_page, so it must still return the precise page asked for, but we can use the folio API to reduce the number of conversions between folios & pages. Signed-off-by: Matthew Wilcox (Oracle) Link: https://lore.kernel.org/r/20230324180129.1220691-30-willy@infradead.org Signed-off-by: Theodore Ts'o --- fs/ext4/verity.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index afe847c967a4..3b01247066dd 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -361,21 +361,21 @@ static struct page *ext4_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT; - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (!page || !PageUptodate(page)) { + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!folio || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); - if (page) - put_page(page); + if (folio) + folio_put(folio); else if (num_ra_pages > 1) page_cache_ra_unbounded(&ractl, num_ra_pages, 0); - page = read_mapping_page(inode->i_mapping, index, NULL); + folio = read_mapping_folio(inode->i_mapping, index, NULL); } - return page; + return folio_file_page(folio, index); } static int ext4_write_merkle_tree_block(struct inode *inode, const void *buf, -- cgit v1.2.3 From bd159398a2d2234de07d310132865706964aaaa7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:32 +0200 Subject: jbd2: Don't refuse invalidation of already invalidated buffers When invalidating buffers under the partial tail page, jbd2_journal_invalidate_folio() returns -EBUSY if the buffer is part of the committing transaction as we cannot safely modify buffer state. However if the buffer is already invalidated (due to previous invalidation attempts from ext4_wait_for_tail_page_commit()), there's nothing to do and there's no point in returning -EBUSY. This fixes occasional warnings from ext4_journalled_invalidate_folio() triggered by generic/051 fstest when blocksize < pagesize.
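For context, the fix below keys off bh->b_bdev. A sketch of the check, assuming (as the zap path elsewhere in journal_unmap_buffer() does) that invalidation clears the buffer's device pointer; the helper name is hypothetical:

static bool jbd2_buffer_already_zapped(struct buffer_head *bh)
{
        /*
         * A previous invalidation attempt cleared the mapping state,
         * including b_bdev, so a NULL device pointer means the buffer
         * no longer points at disk and returning -EBUSY is pointless.
         */
        return bh->b_bdev == NULL;
}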
Fixes: 53e872681fed ("ext4: fix deadlock in journal_unmap_buffer()") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-1-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/jbd2/transaction.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 15de1385012e..18611241f451 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2387,6 +2387,9 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh, spin_unlock(&jh->b_state_lock); write_unlock(&journal->j_state_lock); jbd2_journal_put_journal_head(jh); + /* Already zapped buffer? Nothing to do... */ + if (!bh->b_bdev) + return 0; return -EBUSY; } /* -- cgit v1.2.3 From d84c9ebdac1e39bc7b036c0c829ee8c1956edabc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:33 +0200 Subject: ext4: Mark pages with journalled data dirty Currently pages with journalled data written by write(2) or modified by block zeroing during truncate(2) are not marked as dirty. They are dirtied only once the transaction commits. This however makes the writeback code think the inode has no pages to write and so ext4_writepages() is not called to make pages with journalled data persistent. Mark pages with journalled data dirty (similarly to what happens for writes through mmap) so that the writeback code knows about them and ext4_writepages() can do what it needs to do for the inode. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 6d628d6c0847..c5de0c04204c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1003,6 +1003,18 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, return ret; } +/* + * Helper for handling dirtying of journalled data. We also mark the folio as + * dirty so that the writeback code knows that this folio (and inode) contains + * dirty data. ext4_writepages() then commits the appropriate transaction to + * make the data stable. + */ +static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) +{ + folio_mark_dirty(bh->b_folio); + return ext4_handle_dirty_metadata(handle, NULL, bh); +} + int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { @@ -1025,7 +1037,7 @@ int do_journal_get_write_access(handle_t *handle, struct inode *inode, ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); return ret; } @@ -1272,7 +1284,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode, if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - ret = ext4_handle_dirty_metadata(handle, NULL, bh); + ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; @@ -1356,7 +1368,7 @@ static int ext4_write_end(struct file *file, /* * This is a private version of page_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need - * to call ext4_handle_dirty_metadata() instead. + * to call ext4_dirty_journalled_data() instead.
*/ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, @@ -3740,7 +3752,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); -- cgit v1.2.3 From 265e72efa99fcc0959f8d33d346a7e0f2e3fe201 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:34 +0200 Subject: ext4: Keep pages with journalled data dirty Currently we clear the page dirty bit when we checkpoint some buffers from a page with journalled data or when we perform delayed dirtying of a page in ext4_writepages(). In a quest to simplify handling of journalled data we want to keep the page dirty as long as it has either buffers to checkpoint or journalled dirty data. So make sure to keep the page dirty in ext4_writepages() if it still has journalled data attached to it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 1 - fs/ext4/page-io.c | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5de0c04204c..473226783d00 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2393,7 +2393,6 @@ static int mpage_journal_page_buffers(handle_t *handle, int len; ClearPageChecked(page); - clear_page_dirty_for_io(page); mpd->wbc->nr_to_write--; if (page->index == size >> PAGE_SHIFT && diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 8fe1875b0a42..2e5e94219693 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -479,9 +479,11 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * to redirty the folio and keep TOWRITE tag so that * racing WB_SYNC_ALL writeback does not skip the folio. * This happens e.g. when doing writeout for - * transaction commit. + * transaction commit or when journalled data is not + * yet committed. */ - if (buffer_dirty(bh)) { + if (buffer_dirty(bh) || + (buffer_jbd(bh) && buffer_jbddirty(bh))) { if (!folio_test_dirty(folio)) folio_redirty_for_writepage(wbc, folio); keep_towrite = true; -- cgit v1.2.3 From 5e1bdea6391d09fde424a1406a04e01b208a04d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:35 +0200 Subject: ext4: Clear dirty bit from pages without data to write With journalled data it can happen that the checkpointing code will write out page contents without clearing the page dirty bit. The logic in ext4_page_nomap_can_writeout() then results in us never calling mpage_submit_page() and thus never clearing the dirty bit. Drop the optimization with ext4_page_nomap_can_writeout() and just always call mpage_submit_page(). ext4_bio_write_page() knows when to redirty the page, and the additional clearing and setting of the page dirty bit for ordered mode writeout is cheap enough that it is not worth jumping through hoops to avoid.
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 473226783d00..bca86f8f1594 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2348,19 +2348,6 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode) MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } -/* Return true if the folio needs to be written as part of transaction commit */ -static bool ext4_folio_nomap_can_writeout(struct folio *folio) { - struct buffer_head *bh, *head; - - bh = head = folio_buffers(folio); - do { - if (buffer_dirty(bh) && buffer_mapped(bh) && !buffer_delay(bh)) - return true; - } while ((bh = bh->b_this_page) != head); - return false; -} - static int ext4_journal_page_buffers(handle_t *handle, struct page *page, int len) { @@ -2545,13 +2532,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * range operations before discarding the page cache. */ if (!mpd->can_map) { - if (ext4_folio_nomap_can_writeout(folio)) { - WARN_ON_ONCE(sb->s_writers.frozen == - SB_FREEZE_COMPLETE); - err = mpage_submit_folio(mpd, folio); - if (err < 0) - goto out; - } + WARN_ON_ONCE(sb->s_writers.frozen == + SB_FREEZE_COMPLETE); + err = mpage_submit_folio(mpd, folio); + if (err < 0) + goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { WARN_ON_ONCE(sb->s_writers.frozen >= -- cgit v1.2.3 From 1f1a55f0bf069799edd5f21a99ac1e3b10ebafee Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:36 +0200 Subject: ext4: Commit transaction before writing back pages in data=journal mode When journalling data we currently just walk over pages, journal those that are marked for delayed dirtying (only pinned pages dirtied behind our back these days) and checkpoint other dirty pages. Because some pages may be part of a running transaction, the result is that after filemap_write_and_wait() we are not guaranteed pages are stable on disk. Thus places that want to flush current pagecache content need to jump through hoops to make sure journalled data is not lost. This is manageable in cases completely controlled by ext4 (such as extent shifting operations or inode eviction) but it gets ugly for stuff like fsverity. Furthermore it is rather error prone as people often do not realize journalled data needs special handling. So change ext4_writepages() to commit the transaction with the inode's data before going through the writeback loop in WB_SYNC_ALL mode. As a result filemap_write_and_wait() is now really getting pages to stable storage and makes pagecache pages safe to reclaim. Consequently we can remove the special handling of journalled data from several places in follow-up patches. Note that this will make fsync(2) for journalled data more expensive as we will end up not only committing the transaction we need but also checkpointing the data (which we may have previously skipped if the data was part of the running transaction). If we really cared, we would need to introduce a special VFS function for writing out & invalidating page cache for a range, use the ->launder_page callback to perform checkpointing, and use it from all the places that need this functionality. But at this point I'm not convinced the complexity is worth it.
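The caller-visible consequence can be sketched as follows: with this change a plain write-and-wait is enough to make journalled data both committed and checkpointed, so the pagecache may be dropped afterwards (hypothetical helper; error handling omitted):

static int ext4_flush_journalled_data(struct inode *inode)
{
        /*
         * WB_SYNC_ALL writeback now commits the transaction holding
         * the journalled data and then checkpoints it, so this one
         * call leaves the pages clean and safe to reclaim.
         */
        return filemap_write_and_wait(inode->i_mapping);
}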
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bca86f8f1594..fb1b729ed1f8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1568,6 +1568,7 @@ struct mpage_da_data { struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; + unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -2545,6 +2546,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd, &folio->page); if (err < 0) goto out; + mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { @@ -2634,10 +2636,23 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) /* * data=journal mode does not do delalloc so we just need to writeout / - * journal already mapped buffers + * journal already mapped buffers. On the other hand we need to commit + * the transaction to make the data stable. We expect all the data to be + * already in the journal (the only exception is DMA pinned pages + * dirtied behind our back) so we commit the transaction here and run the + * writeback loop to checkpoint them. The checkpointing is not actually + * necessary to make data persistent *but* quite a few places (extent + * shifting operations, fsverity, ...) depend on being able to drop + * pagecache pages after calling filemap_write_and_wait() and for that + * checkpointing needs to happen. */ - if (ext4_should_journal_data(inode)) + if (ext4_should_journal_data(inode)) { mpd->can_map = 0; + if (wbc->sync_mode == WB_SYNC_ALL) + ext4_fc_commit(sbi->s_journal, + EXT4_I(inode)->i_datasync_tid); + } + mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* @@ -2818,6 +2833,13 @@ static int ext4_writepages(struct address_space *mapping, percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); ret = ext4_do_writepages(&mpd); + /* + * For data=journal writeback we could have come across pages marked + * for delayed dirtying (PageChecked) which were just added to the + * running transaction. Try once more to get them to stable storage. + */ + if (!ret && mpd.journalled_more_data) + ret = ext4_do_writepages(&mpd); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); return ret; -- cgit v1.2.3 From e360c6ed727401ad1a3b5080f06d2ec3a4b75f5b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:37 +0200 Subject: ext4: Drop special handling of journalled data from ext4_sync_file() Now that ext4_writepages() makes sure all pages with journalled data are stable on disk, we don't need special handling of journalled data in ext4_sync_file(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/fsync.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 027a7d7037a0..f65fdb27ce14 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -153,23 +153,12 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; /* - * data=writeback,ordered: * The caller's filemap_fdatawrite()/wait will sync the data. * Metadata is in the journal, we wait for proper transaction to * commit here. - * - * data=journal: - * filemap_fdatawrite won't do anything (the buffers are clean).
- * ext4_force_commit will write the file data into the journal and - * will wait on that. - * filemap_fdatawait() will encounter a ton of newly-dirtied pages - * (they were dirtied by commit). But that's OK - the blocks are - * safe in-journal, which is all fsync() needs to ensure. */ if (!sbi->s_journal) ret = ext4_fsync_nojournal(inode, datasync, &needs_barrier); - else if (ext4_should_journal_data(inode)) - ret = ext4_force_commit(inode->i_sb); else ret = ext4_fsync_journal(inode, datasync, &needs_barrier); -- cgit v1.2.3 From c000dfec7e88cee660cbc594c9716ecc979dc1f1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:38 +0200 Subject: ext4: Drop special handling of journalled data from extent shifting operations Now that filemap_write_and_wait() makes sure pages with journalled data are safely on disk, ext4_collapse_range() and ext4_insert_range() do not need special handling of journalled data. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-7-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3559ea6b0781..0b622ae29a73 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5290,13 +5290,6 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case @@ -5443,13 +5436,6 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - /* Call ext4_force_commit to flush all data in case of data=journal */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { -- cgit v1.2.3 From 783ae448b7a21ca59ffe5bc261c17d9c3ebfe2ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:39 +0200 Subject: ext4: Fix special handling of journalled data from extent zeroing The handling of journalled data in ext4_zero_range() is incomplete. We do not need to commit the running transaction; rather, we need to checkpoint pages with journalled data. If we don't, the journal tail can be advanced beyond the transaction containing the journalled data, and if we then crash before committing the transaction doing the zeroing, we will have inconsistent (too old) data in the file. Make sure file pages with journalled data are properly checkpointed before removing them from the page cache.
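The required ordering can be sketched as follows (hypothetical helper condensing the fix below; locking and error paths trimmed):

static int ext4_checkpoint_then_drop_cache(struct inode *inode,
                                           loff_t start, loff_t end)
{
        int ret = 0;

        /* 1) Write out and checkpoint journalled pages first... */
        if (ext4_should_journal_data(inode))
                ret = filemap_write_and_wait_range(inode->i_mapping,
                                                   start, end);
        if (ret)
                return ret;

        /*
         * 2) ...and only then discard the page cache, so that a crash
         * before the zeroing transaction commits cannot leave the file
         * with data older than what was journalled.
         */
        truncate_pagecache_range(inode, start, end - 1);
        return 0;
}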
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0b622ae29a73..e79c767cc5e0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4526,13 +4526,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); - /* Call ext4_force_commit to flush all data in case of data=journal. */ - if (ext4_should_journal_data(inode)) { - ret = ext4_force_commit(inode->i_sb); - if (ret) - return ret; - } - /* * Round up offset. This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to @@ -4616,6 +4609,20 @@ static long ext4_zero_range(struct file *file, loff_t offset, filemap_invalidate_unlock(mapping); goto out_mutex; } + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsistent data on + * disk in case of a crash before the zeroing transaction is + * committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) { + filemap_invalidate_unlock(mapping); + goto out_mutex; + } + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = current_time(inode); -- cgit v1.2.3 From 56c2a0e3d90d3822fab157883957523e327bc9ae Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:40 +0200 Subject: ext4: Drop special handling of journalled data from ext4_evict_inode() Now that ext4_writepages() makes sure journalled data is on stable storage, the write_inode_now() call in iput_final() is enough to make pagecache pages with journalled data really clean (data committed and checkpointed). So we can drop special handling of journalled data in ext4_evict_inode(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fb1b729ed1f8..bb8ac3e0f784 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -179,33 +179,6 @@ void ext4_evict_inode(struct inode *inode) if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { - /* - * When journalling data dirty buffers are tracked only in the - * journal. So although mm thinks everything is clean and - * ready for reaping the inode might still have some pages to - * write in the running transaction or waiting to be - * checkpointed. Thus calling jbd2_journal_invalidate_folio() - * (via truncate_inode_pages()) to discard these buffers can - * cause data loss. Also even if we did not discard these - * buffers, we would have no way to find them after the inode - * is reaped and thus user could see stale data if he tries to - * read them before the transaction is checkpointed. So be - * careful and force everything to disk here... We use - * ei->i_datasync_tid to store the newest transaction - * containing inode's data. - * - * Note that directories do not have this problem because they - * don't use page cache.
- */ - if (inode->i_ino != EXT4_JOURNAL_INO && - ext4_should_journal_data(inode) && - S_ISREG(inode->i_mode) && inode->i_data.nrpages) { - journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - - jbd2_complete_transaction(journal, commit_tid); - filemap_write_and_wait(&inode->i_data); - } truncate_inode_pages_final(&inode->i_data); goto no_delete; -- cgit v1.2.3 From 7c375870fdc5f50a001f8265cd8744a78d2d43ab Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:41 +0200 Subject: ext4: Drop special handling of journalled data from ext4_quota_on() Now that ext4_writepages() makes sure all journalled data is committed and checkpointed, sync_filesystem() call done by dquot_quota_on() is enough for quota IO to see uptodate data. So drop special handling of journalled data from ext4_quota_on(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-10-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 54fb600bf51f..a876ebd534a8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6881,23 +6881,6 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } - /* - * When we journal data on quota file, we have to flush journal to see - * all updates to the file when we bypass pagecache... - */ - if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(d_inode(path->dentry))) { - /* - * We don't need to lock updates but journal_flush() could - * otherwise be livelocked... - */ - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - if (err) - return err; - } - lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); if (!err) { -- cgit v1.2.3 From 951cafa6b80e55b966047b0c9cc5564df8b92145 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:42 +0200 Subject: ext4: Simplify handling of journalled data in ext4_bmap() Now that ext4_writepages() gets journalled data into its final location we just use filemap_write_and_wait() instead of special handling of journalled data in ext4_bmap(). We can also drop EXT4_STATE_JDATA flag as it is not used anymore. 
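For context, the classic consumer of ->bmap is the FIBMAP ioctl; a minimal userspace probe (illustrative only; FIBMAP requires CAP_SYS_RAWIO) shows why dirty delalloc or journalled data has to be flushed before the block lookup, since the mapping would otherwise not reflect the file's latest contents:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>           /* FIBMAP */

int main(int argc, char **argv)
{
        int fd, block = 0;      /* logical block 0 on input */

        if (argc < 2)
                return 1;
        fd = open(argv[1], O_RDONLY);
        if (fd < 0 || ioctl(fd, FIBMAP, &block) < 0) {
                perror("FIBMAP");
                return 1;
        }
        printf("logical block 0 -> physical block %d\n", block);
        close(fd);
        return 0;
}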
Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-11-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/inode.c | 44 +++++--------------------------------------- 2 files changed, 5 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 83f0cc02250f..271d9661ce82 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1886,7 +1886,6 @@ static inline void ext4_simulate_fail_bh(struct super_block *sb, * Inode dynamic state flags */ enum { - EXT4_STATE_JDATA, /* journaled data exists */ EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bb8ac3e0f784..d43beb886a30 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1414,7 +1414,6 @@ static int ext4_journalled_write_end(struct file *file, } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); - ext4_set_inode_state(inode, EXT4_STATE_JDATA); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); @@ -2340,8 +2339,6 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page, ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; - ext4_set_inode_state(inode, EXT4_STATE_JDATA); - return ret; } @@ -3085,9 +3082,7 @@ int ext4_alloc_da_blocks(struct inode *inode) static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - journal_t *journal; sector_t ret = 0; - int err; inode_lock_shared(inode); /* @@ -3097,45 +3092,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && - test_opt(inode->i_sb, DELALLOC)) { + (test_opt(inode->i_sb, DELALLOC) || + ext4_should_journal_data(inode))) { /* - * With delalloc we want to sync the file - * so that we can make sure we allocate - * blocks for file + * With delalloc or journalled data we want to sync the file so + * that we can make sure we allocate blocks for the file and + * the data is in place for the user to see it */ filemap_write_and_wait(mapping); } - if (EXT4_JOURNAL(inode) && - ext4_test_inode_state(inode, EXT4_STATE_JDATA)) { - /* - * This is a REALLY heavyweight approach, but the use of - * bmap on dirty files is expected to be extremely rare: - * only if we run lilo or swapon on a freshly made file - * do we expect this to happen. - * - * (bmap requires CAP_SYS_RAWIO so this does not - * represent an unprivileged user DOS attack --- we'd be - * in trouble if mortal users could trigger this path at - * will.) - * - * NB. EXT4_STATE_JDATA is not set on files other than - * regular files. If somebody wants to bmap a directory - * or symlink and gets confused because the buffer - * hasn't yet been flushed to disk, they deserve - * everything they get.
- */ - - ext4_clear_inode_state(inode, EXT4_STATE_JDATA); - journal = EXT4_JOURNAL(inode); - jbd2_journal_lock_updates(journal); - err = jbd2_journal_flush(journal, 0); - jbd2_journal_unlock_updates(journal); - - if (err) - goto out; - ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: -- cgit v1.2.3 From ab382539adcb43f52d984abf58d8e3459cd707a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:43 +0200 Subject: ext4: Update comment in mpage_prepare_extent_to_map() Since filemap_write_and_wait() is now enough to get journalled data to its final location, update the comment in mpage_prepare_extent_to_map(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-12-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d43beb886a30..c1888cd2a48f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2496,11 +2496,10 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This - * is crutial so that forcing a transaction commit and - * then calling filemap_write_and_wait() guarantees - * current state of data is in its final location. Such - * sequence is used for example by insert/collapse - * range operations before discarding the page cache. + * makes sure current data is checkpointed to the final + * location before possibly journalling it again, which + * is desirable when the page is frequently dirtied + * through a pin. */ if (!mpd->can_map) { -- cgit v1.2.3 From d0ab8368c175f7b5ef0851283a2ba362a9ab327a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 29 Mar 2023 17:49:44 +0200 Subject: Revert "ext4: Fix warnings when freezing filesystem with journaled data" After making ext4_writepages() properly clean all pages there is no need for special treatment of filesystem freezing. Revert commit e6c28a26b799c7640b77daff3e4a67808c74381c. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230329154950.19720-13-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 15 +-------------- fs/ext4/super.c | 11 ----------- 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c1888cd2a48f..8dbd352e3986 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2385,7 +2385,6 @@ static int mpage_journal_page_buffers(handle_t *handle, static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct super_block *sb = mpd->inode->i_sb; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; @@ -2405,15 +2404,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->next_page = index; - /* - * Start a transaction for writeback of journalled data. We don't start - * the transaction if the filesystem is frozen. In that case we - * should not have any dirty data to write anymore but possibly there - * are stray page dirty bits left by the checkpointing code so this - * loop clears them.
- */ - if (ext4_should_journal_data(mpd->inode) && - sb->s_writers.frozen < SB_FREEZE_FS) { + if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) @@ -2502,15 +2493,11 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) * through a pin. */ if (!mpd->can_map) { - WARN_ON_ONCE(sb->s_writers.frozen == - SB_FREEZE_COMPLETE); err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { - WARN_ON_ONCE(sb->s_writers.frozen >= - SB_FREEZE_FS); err = mpage_journal_page_buffers(handle, mpd, &folio->page); if (err < 0) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a876ebd534a8..690faf766d23 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6293,17 +6293,6 @@ static int ext4_freeze(struct super_block *sb) if (error < 0) goto out; - /* - * Do another sync. We really should not have any dirty data - * anymore but our checkpointing code does not clear page dirty - * bits due to locking constraints so writeback still can get - * started for inodes with journalled data which triggers - * annoying warnings. - */ - error = sync_filesystem(sb); - if (error < 0) - goto out; - /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); if (ext4_orphan_file_empty(sb)) -- cgit v1.2.3 From db9345d9e6f075e1ec26afadf744078ead935fec Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:10 +0800 Subject: ext4: factor out ext4_hash_info_init() Factor out ext4_hash_info_init() to simplify __ext4_fill_super(). No functional change. Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-2-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 50 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 690faf766d23..13c0345c5387 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5024,6 +5024,35 @@ out: return ret; } +static void ext4_hash_info_init(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + unsigned int i; + + for (i = 0; i < 4; i++) + sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); + + sbi->s_def_hash_version = es->s_def_hash_version; + if (ext4_has_feature_dir_index(sb)) { + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + if (!sb_rdonly(sb)) + es->s_flags |= + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + } + } +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; @@ -5179,26 +5208,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); - for (i = 0; i < 4; i++) - sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); - sbi->s_def_hash_version = es->s_def_hash_version; - if (ext4_has_feature_dir_index(sb)) { - i = le32_to_cpu(es->s_flags); - if (i & EXT2_FLAGS_UNSIGNED_HASH) - sbi->s_hash_unsigned = 3; - else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { -#ifdef __CHAR_UNSIGNED__ - if (!sb_rdonly(sb)) - 
es->s_flags |= - cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); - sbi->s_hash_unsigned = 3; -#else - if (!sb_rdonly(sb)) - es->s_flags |= - cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); -#endif - } - } + ext4_hash_info_init(sb); if (ext4_handle_clustersize(sb)) goto failed_mount; -- cgit v1.2.3 From 1f79467c8a6be64940a699de1bd43338a6dd9fdd Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:11 +0800 Subject: ext4: factor out ext4_percpu_param_init() and ext4_percpu_param_destroy() Factor out ext4_percpu_param_init() and ext4_percpu_param_destroy(). And also use ext4_percpu_param_destroy() in ext4_put_super() to avoid duplicated code. No functional change. Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-3-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 85 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 46 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 13c0345c5387..2421b52f1938 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1183,6 +1183,49 @@ static inline void ext4_quota_off_umount(struct super_block *sb) } #endif +static int ext4_percpu_param_init(struct ext4_sb_info *sbi) +{ + ext4_fsblk_t block; + int err; + + block = ext4_count_free_clusters(sbi->s_sb); + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); + if (!err) { + unsigned long freei = ext4_count_free_inodes(sbi->s_sb); + sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); + } + if (!err) + err = percpu_counter_init(&sbi->s_dirs_counter, + ext4_count_dirs(sbi->s_sb), GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); + if (!err) + err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, + GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_writepages_rwsem); + + if (err) + ext4_msg(sbi->s_sb, KERN_ERR, "insufficient memory"); + + return err; +} + +static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) +{ + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); + percpu_free_rwsem(&sbi->s_writepages_rwsem); +} + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1250,12 +1293,7 @@ static void ext4_put_super(struct super_block *sb) kvfree(flex_groups); } rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(get_qf_name(sb, sbi, i)); @@ -5058,7 +5096,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); struct flex_groups **flex_groups; - ext4_fsblk_t block; ext4_fsblk_t logical_sb_block; struct inode *root; int ret = -ENOMEM; @@ -5450,33 +5487,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct 
super_block *sb) sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; - block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, - EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block, - GFP_KERNEL); - if (!err) { - unsigned long freei = ext4_count_free_inodes(sb); - sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, - GFP_KERNEL); - } - if (!err) - err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb), GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, - GFP_KERNEL); - if (!err) - err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0, - GFP_KERNEL); - if (!err) - err = percpu_init_rwsem(&sbi->s_writepages_rwsem); - - if (err) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + if (ext4_percpu_param_init(sbi)) goto failed_mount6; - } if (ext4_has_feature_flex_bg(sb)) if (!ext4_fill_flex_info(sb)) { @@ -5566,12 +5578,7 @@ failed_mount6: kvfree(flex_groups); } rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit); - percpu_free_rwsem(&sbi->s_writepages_rwsem); + ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); -- cgit v1.2.3 From 6ef684988816fdfa29ceff260c97d725a489a942 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:12 +0800 Subject: ext4: use ext4_group_desc_free() in ext4_put_super() to save some duplicated code The only difference here is that ->s_group_desc and ->s_flex_groups share the same rcu read lock here but it is not necessary. In other places they do not share the lock at all. 
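As an aside on the refactoring pattern, the freed array is published via RCU, so readers dereference it under rcu_read_lock(); a minimal sketch of the idiom (hypothetical names, mirroring the helper moved in the diff below):

static void free_rcu_published_array(struct bh_array __rcu **slot, int count)
{
        struct buffer_head **array;
        int i;

        rcu_read_lock();
        /* rcu_dereference() pairs with the rcu_assign_pointer() publisher. */
        array = rcu_dereference(*slot);
        for (i = 0; i < count; i++)
                brelse(array[i]);
        kvfree(array);
        rcu_read_unlock();
}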
Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-4-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2421b52f1938..d305413b2756 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1226,11 +1226,23 @@ static void ext4_percpu_param_destroy(struct ext4_sb_info *sbi) percpu_free_rwsem(&sbi->s_writepages_rwsem); } +static void ext4_group_desc_free(struct ext4_sb_info *sbi) +{ + struct buffer_head **group_desc; + int i; + + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); + for (i = 0; i < sbi->s_gdb_count; i++) + brelse(group_desc[i]); + kvfree(group_desc); + rcu_read_unlock(); +} + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct buffer_head **group_desc; struct flex_groups **flex_groups; int aborted = 0; int i, err; @@ -1281,11 +1293,8 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb)) ext4_commit_super(sb); + ext4_group_desc_free(sbi); rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) @@ -4757,19 +4766,6 @@ static int ext4_geometry_check(struct super_block *sb, return 0; } -static void ext4_group_desc_free(struct ext4_sb_info *sbi) -{ - struct buffer_head **group_desc; - int i; - - rcu_read_lock(); - group_desc = rcu_dereference(sbi->s_group_desc); - for (i = 0; i < sbi->s_gdb_count; i++) - brelse(group_desc[i]); - kvfree(group_desc); - rcu_read_unlock(); -} - static int ext4_group_desc_init(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t logical_sb_block, -- cgit v1.2.3 From dcbf87589d90e3bd5a5a4cf832517f22f3c55efb Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:13 +0800 Subject: ext4: factor out ext4_flex_groups_free() Factor out ext4_flex_groups_free() and it can be used both in __ext4_fill_super() and ext4_put_super(). 
Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-5-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d305413b2756..856f5e591455 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1239,11 +1239,25 @@ static void ext4_group_desc_free(struct ext4_sb_info *sbi) rcu_read_unlock(); } +static void ext4_flex_groups_free(struct ext4_sb_info *sbi) +{ + struct flex_groups **flex_groups; + int i; + + rcu_read_lock(); + flex_groups = rcu_dereference(sbi->s_flex_groups); + if (flex_groups) { + for (i = 0; i < sbi->s_flex_groups_allocated; i++) + kvfree(flex_groups[i]); + kvfree(flex_groups); + } + rcu_read_unlock(); +} + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - struct flex_groups **flex_groups; int aborted = 0; int i, err; @@ -1294,14 +1308,7 @@ static void ext4_put_super(struct super_block *sb) ext4_commit_super(sb); ext4_group_desc_free(sbi); - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); + ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -5091,7 +5098,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct flex_groups **flex_groups; ext4_fsblk_t logical_sb_block; struct inode *root; int ret = -ENOMEM; @@ -5566,14 +5572,7 @@ failed_mount7: ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); - rcu_read_lock(); - flex_groups = rcu_dereference(sbi->s_flex_groups); - if (flex_groups) { - for (i = 0; i < sbi->s_flex_groups_allocated; i++) - kvfree(flex_groups[i]); - kvfree(flex_groups); - } - rcu_read_unlock(); + ext4_flex_groups_free(sbi); ext4_percpu_param_destroy(sbi); failed_mount5: ext4_ext_release(sb); -- cgit v1.2.3 From 68e624398f7df3fd91d4a4cd148e722a18d76054 Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:14 +0800 Subject: ext4: rename two functions with 'check' The naming styles are different for some functions with 'check' in their names. Some of them are like: ext4_check_quota_consistency ext4_check_test_dummy_encryption ext4_check_opt_consistency ext4_check_descriptors ext4_check_feature_compatibility While the others looks like below: ext4_geometry_check ext4_journal_data_mode_check This is not a big deal and boils down to personal preference. But I'd like to make them consistent. 
Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-6-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 856f5e591455..53e2945236b0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4713,7 +4713,7 @@ static int ext4_check_feature_compatibility(struct super_block *sb, return 0; } -static int ext4_geometry_check(struct super_block *sb, +static int ext4_check_geometry(struct super_block *sb, struct ext4_super_block *es) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -4922,7 +4922,7 @@ out: return -EINVAL; } -static int ext4_journal_data_mode_check(struct super_block *sb) +static int ext4_check_journal_data_mode(struct super_block *sb) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with " @@ -5162,7 +5162,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_encoding_init(sb, es)) goto failed_mount; - if (ext4_journal_data_mode_check(sb)) + if (ext4_check_journal_data_mode(sb)) goto failed_mount; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | @@ -5264,7 +5264,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - if (ext4_geometry_check(sb, es)) + if (ext4_check_geometry(sb, es)) goto failed_mount; timer_setup(&sbi->s_err_report, print_daily_error_info, 0); -- cgit v1.2.3 From 269e9226c29fbfe7f66a324d6d32d4a53bcffbbe Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:15 +0800 Subject: ext4: move s_reserved_gdt_blocks and addressable checking into ext4_check_geometry() These two checks are more suitable to be put into ext4_check_geometry() rather than spread around outside it. Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-7-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 53e2945236b0..6e069f82934c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4718,6 +4718,25 @@ static int ext4_check_geometry(struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); __u64 blocks_count; + int err; + + if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { + ext4_msg(sb, KERN_ERR, + "Number of reserved GDT blocks insanely large: %d", + le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); + return -EINVAL; + } + /* + * Test whether we have more sectors than will fit in sector_t, + * and whether the max offset is addressable by the page cache.
+ */ + err = generic_check_addressable(sb->s_blocksize_bits, + ext4_blocks_count(es)); + if (err) { + ext4_msg(sb, KERN_ERR, "filesystem" + " too large to mount safely on this system"); + return err; + } /* check blocks count against device size */ blocks_count = sb_bdev_nr_blocks(sb); @@ -5174,13 +5193,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_check_feature_compatibility(sb, es, silent)) goto failed_mount; - if (le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) > (sb->s_blocksize / 4)) { - ext4_msg(sb, KERN_ERR, - "Number of reserved GDT blocks insanely large: %d", - le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks)); - goto failed_mount; - } - if (sbi->s_daxdev) { if (sb->s_blocksize == PAGE_SIZE) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); @@ -5252,18 +5264,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_handle_clustersize(sb)) goto failed_mount; - /* - * Test whether we have more sectors than will fit in sector_t, - * and whether the max offset is addressable by the page cache. - */ - err = generic_check_addressable(sb->s_blocksize_bits, - ext4_blocks_count(es)); - if (err) { - ext4_msg(sb, KERN_ERR, "filesystem" - " too large to mount safely on this system"); - goto failed_mount; - } - if (ext4_check_geometry(sb, es)) goto failed_mount; -- cgit v1.2.3 From 107d2be90116a1731d2d81296100c0a4c454a89f Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:16 +0800 Subject: ext4: factor out ext4_block_group_meta_init() Factor out ext4_block_group_meta_init(). No functional change. Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-8-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 90 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 51 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6e069f82934c..f271bb53fb04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5113,6 +5113,55 @@ static void ext4_hash_info_init(struct super_block *sb) } } +static int ext4_block_group_meta_init(struct super_block *sb, int silent) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int has_huge_files; + + has_huge_files = ext4_has_feature_huge_file(sb); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); + + sbi->s_desc_size = le16_to_cpu(es->s_desc_size); + if (ext4_has_feature_64bit(sb)) { + if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || + sbi->s_desc_size > EXT4_MAX_DESC_SIZE || + !is_power_of_2(sbi->s_desc_size)) { + ext4_msg(sb, KERN_ERR, + "unsupported descriptor size %lu", + sbi->s_desc_size); + return -EINVAL; + } + } else + sbi->s_desc_size = EXT4_MIN_DESC_SIZE; + + sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); + sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); + + sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); + if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { + if (!silent) + ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + return -EINVAL; + } + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > sb->s_blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_inodes_per_group); + return -EINVAL; + } + sbi->s_itb_per_group = sbi->s_inodes_per_group / + sbi->s_inodes_per_block; + 
sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); + sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; + sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); + sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); + + return 0; +} + static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) { struct ext4_super_block *es = NULL; @@ -5121,7 +5170,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) struct inode *root; int ret = -ENOMEM; unsigned int i; - int needs_recovery, has_huge_files; + int needs_recovery; int err = 0; ext4_group_t first_not_zeroed; struct ext4_fs_context *ctx = fc->fs_private; @@ -5219,45 +5268,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount; } - has_huge_files = ext4_has_feature_huge_file(sb); - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, - has_huge_files); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); - - sbi->s_desc_size = le16_to_cpu(es->s_desc_size); - if (ext4_has_feature_64bit(sb)) { - if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT || - sbi->s_desc_size > EXT4_MAX_DESC_SIZE || - !is_power_of_2(sbi->s_desc_size)) { - ext4_msg(sb, KERN_ERR, - "unsupported descriptor size %lu", - sbi->s_desc_size); - goto failed_mount; - } - } else - sbi->s_desc_size = EXT4_MIN_DESC_SIZE; - - sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); - sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - - sbi->s_inodes_per_block = sb->s_blocksize / EXT4_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0 || sbi->s_blocks_per_group == 0) { - if (!silent) - ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); + if (ext4_block_group_meta_init(sb, silent)) goto failed_mount; - } - if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || - sbi->s_inodes_per_group > sb->s_blocksize * 8) { - ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", - sbi->s_inodes_per_group); - goto failed_mount; - } - sbi->s_itb_per_group = sbi->s_inodes_per_group / - sbi->s_inodes_per_block; - sbi->s_desc_per_block = sb->s_blocksize / EXT4_DESC_SIZE(sb); - sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY; - sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); - sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); ext4_hash_info_init(sb); -- cgit v1.2.3 From 54902099b1d8b62bea7cfd949aa3acd9eae1c3db Mon Sep 17 00:00:00 2001 From: Jason Yan Date: Thu, 23 Mar 2023 22:05:17 +0800 Subject: ext4: move dax and encrypt checking into ext4_check_feature_compatibility() These checkings are also related with feature compatibility checkings. So move them into ext4_check_feature_compatibility(). No functional change. 
Signed-off-by: Jason Yan Link: https://lore.kernel.org/r/20230323140517.1070239-9-yanaijie@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 54 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f271bb53fb04..403cc0e6cd65 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4641,6 +4641,8 @@ static int ext4_check_feature_compatibility(struct super_block *sb, struct ext4_super_block *es, int silent) { + struct ext4_sb_info *sbi = EXT4_SB(sb); + if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || ext4_has_ro_compat_features(sb) || @@ -4710,6 +4712,32 @@ static int ext4_check_feature_compatibility(struct super_block *sb, if (!ext4_feature_set_ok(sb, (sb_rdonly(sb)))) return -EINVAL; + if (sbi->s_daxdev) { + if (sb->s_blocksize == PAGE_SIZE) + set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); + else + ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); + } + + if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { + if (ext4_has_feature_inline_data(sb)) { + ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" + " that may contain inline data"); + return -EINVAL; + } + if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { + ext4_msg(sb, KERN_ERR, + "DAX unsupported by block device."); + return -EINVAL; + } + } + + if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + return -EINVAL; + } + return 0; } @@ -5242,32 +5270,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) if (ext4_check_feature_compatibility(sb, es, silent)) goto failed_mount; - if (sbi->s_daxdev) { - if (sb->s_blocksize == PAGE_SIZE) - set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); - else - ext4_msg(sb, KERN_ERR, "unsupported blocksize for DAX\n"); - } - - if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { - if (ext4_has_feature_inline_data(sb)) { - ext4_msg(sb, KERN_ERR, "Cannot use DAX on a filesystem" - " that may contain inline data"); - goto failed_mount; - } - if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) { - ext4_msg(sb, KERN_ERR, - "DAX unsupported by block device."); - goto failed_mount; - } - } - - if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) { - ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", - es->s_encryption_level); - goto failed_mount; - } - if (ext4_block_group_meta_init(sb, silent)) goto failed_mount; -- cgit v1.2.3 From 8ae56b4e82ee29b5cc1fbea8b00a0a7e0758f3c2 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Thu, 16 Mar 2023 16:48:31 -0400 Subject: ext4: remove unneeded check of nr_to_submit cppcheck reports fs/ext4/page-io.c:516:51: style: Condition 'nr_to_submit' is always true [knownConditionTrueFalse] if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { ^ This earlier bail-out check makes the condition unnecessary: /* Nothing to submit? Just unlock the page...
*/ if (!nr_to_submit) return 0; Signed-off-by: Tom Rix Fixes: dff4ac75eeee ("ext4: move keep_towrite handling to ext4_bio_write_page()") Reviewed-by: Eric Biggers Link: https://lore.kernel.org/r/20230316204831.2472537-1-trix@redhat.com Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 2e5e94219693..3621f29ec671 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -510,7 +510,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, * (e.g. holes) to be unnecessarily encrypted, but this is rare and * can't happen in the common case of blocksize == PAGE_SIZE. */ - if (fscrypt_inode_uses_fs_layer_crypto(inode) && nr_to_submit) { + if (fscrypt_inode_uses_fs_layer_crypto(inode)) { gfp_t gfp_flags = GFP_NOFS; unsigned int enc_bytes = round_up(len, i_blocksize(inode)); struct page *bounce_page; -- cgit v1.2.3 From 17809d3cf801374d7c23101800770ea34951f3c8 Mon Sep 17 00:00:00 2001 From: wuchi Date: Sat, 1 Apr 2023 15:53:03 +0800 Subject: ext4: remove useless conditional branch code This is OK because the compiler will generate the same code either way; it just simplifies the source. Signed-off-by: wuchi Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230401075303.45206-1-wuchi.zero@gmail.com Signed-off-by: Theodore Ts'o --- fs/ext4/bitmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 87c1c8ae9298..cd725bebe69e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -74,10 +74,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, } else calculated &= 0xFFFF; - if (provided == calculated) - return 1; - - return 0; + return provided == calculated; } void ext4_block_bitmap_csum_set(struct super_block *sb, -- cgit v1.2.3 From 519fe1bae7e20fc4e7f179d50b6102b49980e85d Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sun, 2 Apr 2023 11:37:42 +0900 Subject: ext4: Add a uapi header for ext4 userspace APIs Create a uapi header include/uapi/linux/ext4.h, move the ioctls and associated data structures to the uapi header, and include it from fs/ext4/ext4.h.
Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/680175260970d977d16b5cc7e7606483ec99eb63.1680402881.git.josh@joshtriplett.org Signed-off-by: Theodore Ts'o --- MAINTAINERS | 1 + fs/ext4/ext4.h | 91 +----------------------------------- include/uapi/linux/ext4.h | 117 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 119 insertions(+), 90 deletions(-) create mode 100644 include/uapi/linux/ext4.h (limited to 'fs') diff --git a/MAINTAINERS b/MAINTAINERS index d8ebab595b2a..364719171ba7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7728,6 +7728,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git F: Documentation/filesystems/ext4/ F: fs/ext4/ F: include/trace/events/ext4.h +F: include/uapi/linux/ext4.h Extended Verification Module (EVM) M: Mimi Zohar diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 271d9661ce82..18cb2680dc39 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -40,6 +40,7 @@ #ifdef __KERNEL__ #include #endif +#include #include #include @@ -591,17 +592,6 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(RESERVED); } -/* Used to pass group descriptor data when online resize is done */ -struct ext4_new_group_input { - __u32 group; /* Group number for this data */ - __u64 block_bitmap; /* Absolute block number of block bitmap */ - __u64 inode_bitmap; /* Absolute block number of inode bitmap */ - __u64 inode_table; /* Absolute block number of inode table start */ - __u32 blocks_count; /* Total number of blocks in this group */ - __u16 reserved_blocks; /* Number of reserved blocks in this group */ - __u16 unused; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; @@ -698,70 +688,6 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 -/* - * ioctl commands - */ -#define EXT4_IOC_GETVERSION _IOR('f', 3, long) -#define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION -#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION -#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) -#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) -#define EXT4_IOC_MIGRATE _IO('f', 9) - /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ - /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ -#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) -#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) -#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) -#define EXT4_IOC_SWAP_BOOT _IO('f', 17) -#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) -/* ioctl codes 19--39 are reserved for fscrypt */ -#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) -#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) -#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) -#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) -#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) -#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) - -#define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32) - -/* - * Flags for going down operation - */ -#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ -#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ - -/* - * Flags returned by EXT4_IOC_GETSTATE - * - * We only expose to userspace a subset of the state flags in - * i_state_flags - */ -#define 
EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 -#define EXT4_STATE_FLAG_NEW 0x00000002 -#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 -#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 - -/* flags for ioctl EXT4_IOC_CHECKPOINT */ -#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 -#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 -#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 -#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ - EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ - EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) - -/* - * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID - */ -struct fsuuid { - __u32 fsu_len; - __u32 fsu_flags; - __u8 fsu_uuid[]; -}; - #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -776,12 +702,6 @@ struct fsuuid { #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif -/* - * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. - * It indicates that the entry in extent status cache is for a hole. - */ -#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 - /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF @@ -852,15 +772,6 @@ struct ext4_inode { __le32 i_projid; /* Project ID */ }; -struct move_extent { - __u32 reserved; /* should be zero */ - __u32 donor_fd; /* donor file descriptor */ - __u64 orig_start; /* logical start offset in block for orig */ - __u64 donor_start; /* logical start offset in block for donor */ - __u64 len; /* block length to be moved */ - __u64 moved_len; /* moved block length */ -}; - #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) diff --git a/include/uapi/linux/ext4.h b/include/uapi/linux/ext4.h new file mode 100644 index 000000000000..1c4c2dd29112 --- /dev/null +++ b/include/uapi/linux/ext4.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ + +#ifndef _UAPI_LINUX_EXT4_H +#define _UAPI_LINUX_EXT4_H +#include +#include +#include +#include + +/* + * ext4-specific ioctl commands + */ +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */ + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ +#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) +#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) +#define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) +#define EXT4_IOC_SWAP_BOOT _IO('f', 17) +#define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +/* ioctl codes 19--39 are reserved for fscrypt */ +#define EXT4_IOC_CLEAR_ES_CACHE _IO('f', 40) +#define EXT4_IOC_GETSTATE _IOW('f', 41, __u32) +#define EXT4_IOC_GET_ES_CACHE _IOWR('f', 42, struct fiemap) +#define EXT4_IOC_CHECKPOINT _IOW('f', 43, __u32) +#define EXT4_IOC_GETFSUUID _IOR('f', 44, struct fsuuid) +#define EXT4_IOC_SETFSUUID _IOW('f', 44, struct fsuuid) + +#define EXT4_IOC_SHUTDOWN _IOR('X', 125, __u32) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define 
EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + +/* + * Flags returned by EXT4_IOC_GETSTATE + * + * We only expose to userspace a subset of the state flags in + * i_state_flags + */ +#define EXT4_STATE_FLAG_EXT_PRECACHED 0x00000001 +#define EXT4_STATE_FLAG_NEW 0x00000002 +#define EXT4_STATE_FLAG_NEWENTRY 0x00000004 +#define EXT4_STATE_FLAG_DA_ALLOC_CLOSE 0x00000008 + +/* + * Flags for ioctl EXT4_IOC_CHECKPOINT + */ +#define EXT4_IOC_CHECKPOINT_FLAG_DISCARD 0x1 +#define EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT 0x2 +#define EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN 0x4 +#define EXT4_IOC_CHECKPOINT_FLAG_VALID (EXT4_IOC_CHECKPOINT_FLAG_DISCARD | \ + EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT | \ + EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) + +/* + * Structure for EXT4_IOC_GETFSUUID/EXT4_IOC_SETFSUUID + */ +struct fsuuid { + __u32 fsu_len; + __u32 fsu_flags; + __u8 fsu_uuid[]; +}; + +/* + * Structure for EXT4_IOC_MOVE_EXT + */ +struct move_extent { + __u32 reserved; /* should be zero */ + __u32 donor_fd; /* donor file descriptor */ + __u64 orig_start; /* logical start offset in block for orig */ + __u64 donor_start; /* logical start offset in block for donor */ + __u64 len; /* block length to be moved */ + __u64 moved_len; /* moved block length */ +}; + +/* + * Flags used by EXT4_IOC_SHUTDOWN + */ +#define EXT4_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* + * Returned by EXT4_IOC_GET_ES_CACHE as an additional possible flag. + * It indicates that the entry in extent status cache is for a hole. + */ +#define EXT4_FIEMAP_EXTENT_HOLE 0x08000000 + +#endif /* _UAPI_LINUX_EXT4_H */ -- cgit v1.2.3
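
[Editor's note] To illustrate how the new uapi header is meant to be consumed, here is a minimal userspace sketch (not part of the patch series) that queries an inode's exposed state flags via EXT4_IOC_GETSTATE. It assumes the exported header has been installed as <linux/ext4.h> (e.g. via "make headers_install"); "testfile" is a placeholder path.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ext4.h>	/* the new uapi header added by this patch */

int main(void)
{
	__u32 state = 0;
	int fd = open("testfile", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * EXT4_IOC_GETSTATE returns the subset of i_state_flags that the
	 * kernel exposes to userspace; on a non-ext4 file the ioctl
	 * typically fails (e.g. with ENOTTY).
	 */
	if (ioctl(fd, EXT4_IOC_GETSTATE, &state) < 0) {
		perror("ioctl(EXT4_IOC_GETSTATE)");
		close(fd);
		return 1;
	}
	printf("state: 0x%08x%s%s\n", (unsigned int)state,
	       (state & EXT4_STATE_FLAG_EXT_PRECACHED) ? " ext-precached" : "",
	       (state & EXT4_STATE_FLAG_NEW) ? " new" : "");
	close(fd);
	return 0;
}

Building this needs only the installed uapi headers, with no ext4-internal definitions, which is the point of moving these ioctls out of fs/ext4/ext4.h.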