From 432acd550e3607d5fea23e27f6ab4e4567deccfd Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 7 Dec 2023 08:26:59 +0100
Subject: iomap: move the io_folios field out of struct iomap_ioend

The io_folios member in struct iomap_ioend counts the number of folios
added to an ioend. It is only used at submission time and can thus be
moved to iomap_writepage_ctx instead.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20231207072710.176093-4-hch@lst.de
Reviewed-by: Ritesh Harjani (IBM)
Signed-off-by: Christian Brauner
---
 include/linux/iomap.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..b2a05dff914d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -293,7 +293,6 @@ struct iomap_ioend {
 struct list_head io_list; /* next ioend in chain */
 u16 io_type;
 u16 io_flags; /* IOMAP_F_* */
- u32 io_folios; /* folios added to ioend */
 struct inode *io_inode; /* file being written to */
 size_t io_size; /* size of the extent */
 loff_t io_offset; /* offset in the file */
@@ -329,6 +328,7 @@ struct iomap_writepage_ctx {
 struct iomap iomap;
 struct iomap_ioend *ioend;
 const struct iomap_writeback_ops *ops;
+ u32 nr_folios; /* folios added to the ioend */
 };

 void iomap_finish_ioends(struct iomap_ioend *ioend, int error);

From ae5535efd8c445ad6033ac0d5da0197897b148ea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 7 Dec 2023 08:27:05 +0100
Subject: iomap: don't chain bios

Back in the days when a single bio could only be filled to the hardware
limits and we scheduled a work item for each bio completion, chaining
multiple bios for a single ioend made a lot of sense to reduce the
number of completions. But these days bios can be filled until we reach
the maximum number of vectors or the total size limit, which means we
can always fit at least 1 megabyte worth of data in the worst case, but
usually a lot more due to large folios. The only thing bio chaining buys
us now is a smaller allocation for each additional bio (a plain bio
instead of an ioend with an embedded bio), which is a 52-byte difference
on 64-bit systems.

This is not worth the added complexity, so remove the bio chaining and
only use the bio embedded into the ioend. This will help to simplify
further changes to the iomap writeback code.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20231207072710.176093-10-hch@lst.de
Reviewed-by: Darrick J. Wong
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 90 +++++++++++++-------------------------------
 fs/xfs/xfs_aops.c | 6 ++--
 include/linux/iomap.h | 8 +++--
 3 files changed, 32 insertions(+), 72 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3a3f3ebc070c..5f6affbe7056 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1479,40 +1479,23 @@ static u32
 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 {
 struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- loff_t offset = ioend->io_offset;
- bool quiet = bio_flagged(bio, BIO_QUIET);
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
 u32 folio_count = 0;

- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct folio_iter fi;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here. 
- */ - if (bio == last) - next = NULL; - else - next = bio->bi_private; - - /* walk all folios in bio, ending page IO on them */ - bio_for_each_folio_all(fi, bio) { - iomap_finish_folio_write(inode, fi.folio, fi.length, - error); - folio_count++; - } - bio_put(bio); + /* walk all folios in bio, ending page IO on them */ + bio_for_each_folio_all(fi, bio) { + iomap_finish_folio_write(inode, fi.folio, fi.length, error); + folio_count++; } - /* The ioend has been freed by bio_put() */ - if (unlikely(error && !quiet)) { + if (unlikely(error && !bio_flagged(bio, BIO_QUIET))) { printk_ratelimited(KERN_ERR "%s: writeback error on inode %lu, offset %lld, sector %llu", - inode->i_sb->s_id, inode->i_ino, offset, start); + inode->i_sb->s_id, inode->i_ino, + ioend->io_offset, ioend->io_sector); } + bio_put(bio); /* frees the ioend */ return folio_count; } @@ -1553,7 +1536,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends); static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) { - if (ioend->io_bio->bi_status != next->io_bio->bi_status) + if (ioend->io_bio.bi_status != next->io_bio.bi_status) return false; if ((ioend->io_flags & IOMAP_F_SHARED) ^ (next->io_flags & IOMAP_F_SHARED)) @@ -1618,9 +1601,8 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends); static void iomap_writepage_end_bio(struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; - - iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status)); + iomap_finish_ioend(iomap_ioend_from_bio(bio), + blk_status_to_errno(bio->bi_status)); } /* @@ -1635,9 +1617,6 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, int error) { - ioend->io_bio->bi_private = ioend; - ioend->io_bio->bi_end_io = iomap_writepage_end_bio; - if (wpc->ops->prepare_ioend) error = wpc->ops->prepare_ioend(ioend, error); if (error) { @@ -1647,12 +1626,12 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, * as there is only one reference to the ioend at this point in * time. */ - ioend->io_bio->bi_status = errno_to_blk_status(error); - bio_endio(ioend->io_bio); + ioend->io_bio.bi_status = errno_to_blk_status(error); + bio_endio(&ioend->io_bio); return error; } - submit_bio(ioend->io_bio); + submit_bio(&ioend->io_bio); return 0; } @@ -1666,44 +1645,22 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, REQ_OP_WRITE | wbc_to_write_flags(wbc), GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); + bio->bi_end_io = iomap_writepage_end_bio; wbc_init_bio(wbc, bio); - ioend = container_of(bio, struct iomap_ioend, io_inline_bio); + ioend = iomap_ioend_from_bio(bio); INIT_LIST_HEAD(&ioend->io_list); ioend->io_type = wpc->iomap.type; ioend->io_flags = wpc->iomap.flags; ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = pos; - ioend->io_bio = bio; ioend->io_sector = bio->bi_iter.bi_sector; wpc->nr_folios = 0; return ioend; } -/* - * Allocate a new bio, and chain the old bio to the new one. - * - * Note that we have to perform the chaining in this unintuitive order - * so that the bi_private linkage is set up in the right direction for the - * traversal in iomap_finish_ioend(). 
- */ -static struct bio * -iomap_chain_bio(struct bio *prev) -{ - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); - bio_clone_blkg_association(new, prev); - new->bi_iter.bi_sector = bio_end_sector(prev); - - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); - return new; -} - static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) { if ((wpc->iomap.flags & IOMAP_F_SHARED) != @@ -1714,7 +1671,7 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; if (iomap_sector(&wpc->iomap, pos) != - bio_end_sector(wpc->ioend->io_bio)) + bio_end_sector(&wpc->ioend->io_bio)) return false; /* * Limit ioend bio chain lengths to minimise IO completion latency. This @@ -1739,15 +1696,14 @@ static void iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, size_t poff = offset_in_folio(folio, pos); if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { +new_ioend: if (wpc->ioend) list_add(&wpc->ioend->io_list, iolist); wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); } - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } + if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) + goto new_ioend; if (ifs) atomic_add(len, &ifs->write_bytes_pending); @@ -1978,7 +1934,7 @@ EXPORT_SYMBOL_GPL(iomap_writepages); static int __init iomap_init(void) { return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_inline_bio), + offsetof(struct iomap_ioend, io_bio), BIOSET_NEED_BVECS); } fs_initcall(iomap_init); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 813f85156b0c..4fb244bb884d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -112,7 +112,7 @@ xfs_end_ioend( * longer dirty. If we don't remove delalloc blocks here, they become * stale and can corrupt free space accounting on unmount. */ - error = blk_status_to_errno(ioend->io_bio->bi_status); + error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); @@ -179,7 +179,7 @@ STATIC void xfs_end_bio( struct bio *bio) { - struct iomap_ioend *ioend = bio->bi_private; + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); unsigned long flags; @@ -444,7 +444,7 @@ xfs_prepare_ioend( /* send ioends that might require a transaction to the completion wq */ if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || (ioend->io_flags & IOMAP_F_SHARED)) - ioend->io_bio->bi_end_io = xfs_end_bio; + ioend->io_bio.bi_end_io = xfs_end_bio; return status; } diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b2a05dff914d..b8d3b658ad2b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -297,10 +297,14 @@ struct iomap_ioend { size_t io_size; /* size of the extent */ loff_t io_offset; /* offset in the file */ sector_t io_sector; /* start sector of ioend */ - struct bio *io_bio; /* bio being built */ - struct bio io_inline_bio; /* MUST BE LAST! */ + struct bio io_bio; /* MUST BE LAST! 
*/
 };

+static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
+{
+ return container_of(bio, struct iomap_ioend, io_bio);
+}
+
 struct iomap_writeback_ops {
 /*
 * Required, maps the blocks so that writeback can be performed on

From 30deff8531f469453ccc0981f14eceb0a2ea68d6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 7 Dec 2023 08:27:09 +0100
Subject: iomap: map multiple blocks at a time

The ->map_blocks interface returns a valid range for writeback, but we
still call back into it for every block, which is a bit inefficient.
Change iomap_writepage_map to use the valid range in the map until the
end of the folio or the end of the dirty range inside the folio,
instead of calling back into ->map_blocks for every block.

Note that the range is not used over folio boundaries as we need to be
able to check the mapping sequence count under the folio lock.

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20231207072710.176093-14-hch@lst.de
Signed-off-by: Christian Brauner
---
 fs/iomap/buffered-io.c | 116 ++++++++++++++++++++++++++++++++++---------------
 include/linux/iomap.h | 7 +++
 2 files changed, 88 insertions(+), 35 deletions(-)

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 17d46580cec8..3dab060aed6d 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
 * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
 */
 #include
 #include
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
 return test_bit(block + blks_per_folio, ifs->state);
 }

+static unsigned ifs_find_dirty_range(struct folio *folio,
+ struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned start_blk =
+ offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+ unsigned end_blk = min_not_zero(
+ offset_in_folio(folio, range_end) >> inode->i_blkbits,
+ i_blocks_per_folio(inode, folio));
+ unsigned nblks = 1;
+
+ while (!ifs_block_is_dirty(folio, ifs, start_blk))
+ if (++start_blk == end_blk)
+ return 0;
+
+ while (start_blk + nblks < end_blk) {
+ if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+ break;
+ nblks++;
+ }
+
+ *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+ return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+ u64 range_end)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (*range_start >= range_end)
+ return 0;
+
+ if (ifs)
+ return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+ return range_end - *range_start;
+}
+
 static void ifs_clear_range_dirty(struct folio *folio,
 struct iomap_folio_state *ifs, size_t off, size_t len)
 {
@@ -1701,10 +1739,9 @@ static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 */
 static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
 struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, loff_t pos)
+ struct inode *inode, loff_t pos, unsigned len)
 {
 struct iomap_folio_state *ifs = folio->private;
- unsigned len = i_blocksize(inode);
 size_t poff = offset_in_folio(folio, pos);
 int error;
@@ -1728,29 +1765,41 @@
 static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 struct writeback_control *wbc, struct folio *folio,
- struct inode *inode, u64 pos, unsigned *count)
+ struct inode *inode, u64 pos, unsigned 
dirty_len, + unsigned *count) { int error; - error = wpc->ops->map_blocks(wpc, inode, pos); - if (error) - goto fail; - trace_iomap_writepage_map(inode, &wpc->iomap); - - switch (wpc->iomap.type) { - case IOMAP_INLINE: - WARN_ON_ONCE(1); - error = -EIO; - break; - case IOMAP_HOLE: - break; - default: - error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos); - if (!error) - (*count)++; - } + do { + unsigned map_len; + + error = wpc->ops->map_blocks(wpc, inode, pos); + if (error) + break; + trace_iomap_writepage_map(inode, &wpc->iomap); + + map_len = min_t(u64, dirty_len, + wpc->iomap.offset + wpc->iomap.length - pos); + WARN_ON_ONCE(!folio->private && map_len < dirty_len); + + switch (wpc->iomap.type) { + case IOMAP_INLINE: + WARN_ON_ONCE(1); + error = -EIO; + break; + case IOMAP_HOLE: + break; + default: + error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos, + map_len); + if (!error) + (*count)++; + break; + } + dirty_len -= map_len; + pos += map_len; + } while (dirty_len && !error); -fail: /* * We cannot cancel the ioend directly here on error. We may have * already set other pages under writeback and hence we have to run I/O @@ -1817,7 +1866,7 @@ static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode, * beyond i_size. */ folio_zero_segment(folio, poff, folio_size(folio)); - *end_pos = isize; + *end_pos = round_up(isize, i_blocksize(inode)); } return true; @@ -1828,12 +1877,11 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; - unsigned len = i_blocksize(inode); - unsigned nblocks = i_blocks_per_folio(inode, folio); u64 pos = folio_pos(folio); u64 end_pos = pos + folio_size(folio); unsigned count = 0; - int error = 0, i; + int error = 0; + u32 rlen; WARN_ON_ONCE(!folio_test_locked(folio)); WARN_ON_ONCE(folio_test_dirty(folio)); @@ -1847,7 +1895,7 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, } WARN_ON_ONCE(end_pos <= pos); - if (nblocks > 1) { + if (i_blocks_per_folio(inode, folio) > 1) { if (!ifs) { ifs = ifs_alloc(inode, folio, 0); iomap_set_range_dirty(folio, 0, end_pos - pos); @@ -1870,18 +1918,16 @@ static int iomap_writepage_map(struct iomap_writepage_ctx *wpc, folio_start_writeback(folio); /* - * Walk through the folio to find areas to write back. If we - * run off the end of the current map or find the current map - * invalid, grab a new one. + * Walk through the folio to find dirty areas to write back. */ - for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (ifs && !ifs_block_is_dirty(folio, ifs, i)) - continue; - error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, pos, - &count); + while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) { + error = iomap_writepage_map_blocks(wpc, wbc, folio, inode, + pos, rlen, &count); if (error) break; + pos += rlen; } + if (count) wpc->nr_folios++; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index b8d3b658ad2b..49d93f538785 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -309,6 +309,13 @@ struct iomap_writeback_ops { /* * Required, maps the blocks so that writeback can be performed on * the range starting at offset. + * + * Can return arbitrarily large regions, but we need to call into it at + * least once per folio to allow the file systems to synchronize with + * the write path that could be invalidating mappings. 
+ *
+ * An existing mapping from a previous call to this method can be reused
+ * by the file system if it is still valid.
 */
 int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
 loff_t offset);

From 19871b5c7a003946d3cd4209a348ab7c0df5dbad Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Thu, 7 Dec 2023 08:27:10 +0100
Subject: iomap: pass the length of the dirty region to ->map_blocks

Let the file system know how much dirty data exists at the passed-in
offset. This allows file systems to allocate the right amount of space
that is actually written back if they can't eagerly convert (e.g.
because they don't support unwritten extents).

Signed-off-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20231207072710.176093-15-hch@lst.de
Signed-off-by: Christian Brauner
---
 block/fops.c | 2 +-
 fs/gfs2/bmap.c | 2 +-
 fs/iomap/buffered-io.c | 2 +-
 fs/xfs/xfs_aops.c | 3 ++-
 fs/zonefs/file.c | 3 ++-
 include/linux/iomap.h | 2 +-
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 0cf8cf72cdfa..93bae17ce660 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -482,7 +482,7 @@ static void blkdev_readahead(struct readahead_control *rac)
 }

 static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset, unsigned int len)
 {
 loff_t isize = i_size_read(inode);

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..789af5c8fade 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2465,7 +2465,7 @@ out:
 }

 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset)
+ loff_t offset, unsigned int len)
 {
 int ret;

diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 3dab060aed6d..2ad0e287c704 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1773,7 +1773,7 @@ static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
 do {
 unsigned map_len;

- error = wpc->ops->map_blocks(wpc, inode, pos);
+ error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
 if (error)
 break;
 trace_iomap_writepage_map(inode, &wpc->iomap);

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4fb244bb884d..1698507d1ac7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -276,7 +276,8 @@ static int
 xfs_map_blocks(
 struct iomap_writepage_ctx *wpc,
 struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
 {
 struct xfs_inode *ip = XFS_I(inode);
 struct xfs_mount *mp = ip->i_mount;

diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 6ab2318a9c8e..8dab4c2ad300 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
 * which implies that the page range can only be within the fixed inode size.
 */
 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset,
+ unsigned int len)
 {
 struct zonefs_zone *z = zonefs_inode_zone(inode);

diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 49d93f538785..6fc1c858013d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -318,7 +318,7 @@ struct iomap_writeback_ops {
 * by the file system if it is still valid. 
*/
 int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset);
+ loff_t offset, unsigned len);

 /*
 * Optional, allows the file systems to perform actions just before

From fe3944fb245ab99570552a3bf970b00058a9ca6d Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 2 Feb 2024 12:39:23 -0800
Subject: fs: Move enum rw_hint into a new header file

Move enum rw_hint into a new header file to prepare for using this data
type in the block layer. Add the attribute __packed to reduce the space
occupied by instances of this data type from four bytes to one byte.
Change the data type of i_write_hint from u8 to enum rw_hint.

Reviewed-by: Christoph Hellwig
Acked-by: Chao Yu # for the F2FS part
Cc: Alexander Viro
Cc: Christian Brauner
Cc: Jan Kara
Cc: Christoph Hellwig
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20240202203926.2478590-5-bvanassche@acm.org
Signed-off-by: Christian Brauner
---
 fs/f2fs/f2fs.h | 1 +
 fs/fcntl.c | 1 +
 fs/inode.c | 1 +
 include/linux/fs.h | 16 ++--------------
 include/linux/rw_hint.h | 24 ++++++++++++++++++++++++
 5 files changed, 29 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/rw_hint.h

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef..01fde6d44bf6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>
 #include
 #include

diff --git a/fs/fcntl.c b/fs/fcntl.c
index d2b15351ab8e..00be0a710bba 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>
 #include
 #include

diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e..1aba6c0bf26a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>
 #include
 #include "internal.h"

diff --git a/include/linux/fs.h b/include/linux/fs.h
index ed5966a70495..bdabda5dc364 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -43,6 +43,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>
 #include
 #include

@@ -309,19 +310,6 @@ struct address_space;
 struct writeback_control;
 struct readahead_control;

-/*
- * Write life time hint values.
- * Stored in struct inode as u8.
- */
-enum rw_hint {
- WRITE_LIFE_NOT_SET = 0,
- WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
- WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
- WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
- WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
- WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
-};
-
 /* Match RWF_* bits to IOCB bits */
 #define IOCB_HIPRI (__force int) RWF_HIPRI
 #define IOCB_DSYNC (__force int) RWF_DSYNC
@@ -677,7 +665,7 @@ struct inode {
 spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
 unsigned short i_bytes;
 u8 i_blkbits;
- u8 i_write_hint;
+ enum rw_hint i_write_hint;
 blkcnt_t i_blocks;

 #ifdef __NEED_I_SIZE_ORDERED
diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
new file mode 100644
index 000000000000..309ca72f2dfb
--- /dev/null
+++ b/include/linux/rw_hint.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RW_HINT_H
+#define _LINUX_RW_HINT_H
+
+#include
+#include
+#include
+
+/* Block storage write lifetime hint values. */
+enum rw_hint {
+ WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET,
+ WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+ WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+ WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
+} __packed;
+
+/* Sparse ignores __packed annotations on enums, hence the #ifndef below. 
*/
+#ifndef __CHECKER__
+static_assert(sizeof(enum rw_hint) == 1);
+#endif
+
+#endif /* _LINUX_RW_HINT_H */

From 449813515d3e5efec85206bb91588a6249a421a3 Mon Sep 17 00:00:00 2001
From: Bart Van Assche
Date: Fri, 2 Feb 2024 12:39:25 -0800
Subject: block, fs: Restore the per-bio/request data lifetime fields

Restore support for passing data lifetime information from filesystems
to block drivers. This patch reverts commit b179c98f7697 ("block:
Remove request.write_hint") and commit c75e707fe1aa ("block: remove the
per-bio/request write hint").

This patch does not modify the size of struct bio because the new
bi_write_hint member fills a hole in struct bio. pahole reports the
following for struct bio on an x86_64 system with this patch applied:

 /* size: 112, cachelines: 2, members: 20 */
 /* sum members: 110, holes: 1, sum holes: 2 */
 /* last cacheline: 48 bytes */

Reviewed-by: Kanchan Joshi
Cc: Jens Axboe
Cc: Christoph Hellwig
Signed-off-by: Bart Van Assche
Link: https://lore.kernel.org/r/20240202203926.2478590-7-bvanassche@acm.org
Signed-off-by: Christian Brauner
---
 block/bio.c | 2 ++
 block/blk-crypto-fallback.c | 1 +
 block/blk-merge.c | 8 ++++++++
 block/blk-mq.c | 2 ++
 block/bounce.c | 1 +
 block/fops.c | 3 +++
 fs/buffer.c | 12 ++++++++----
 fs/direct-io.c | 2 ++
 fs/iomap/buffered-io.c | 1 +
 fs/iomap/direct-io.c | 1 +
 fs/mpage.c | 1 +
 include/linux/blk-mq.h | 2 ++
 include/linux/blk_types.h | 2 ++
 13 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index b9642a41f286..c9223e9d31da 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
 bio->bi_opf = opf;
 bio->bi_flags = 0;
 bio->bi_ioprio = 0;
+ bio->bi_write_hint = 0;
 bio->bi_status = 0;
 bio->bi_iter.bi_sector = 0;
 bio->bi_iter.bi_size = 0;
@@ -813,6 +814,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
 {
 bio_set_flag(bio, BIO_CLONED);
 bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
 bio->bi_iter = bio_src->bi_iter;

 if (bio->bi_bdev) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index e6468eab2681..b1e7415f8439 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
 if (bio_flagged(bio_src, BIO_REMAPPED))
 bio_set_flag(bio, BIO_REMAPPED);
 bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
 bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
 bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2d470cf2173e..2a06fd33039d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q,
 if (rq_data_dir(req) != rq_data_dir(next))
 return NULL;

+ /* Don't merge requests with different write hints. */
+ if (req->write_hint != next->write_hint)
+ return NULL;
+
 if (req->ioprio != next->ioprio)
 return NULL;

@@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
 if (!bio_crypt_rq_ctx_compatible(rq, bio))
 return false;

+ /* Don't merge requests with different write hints. 
*/ + if (rq->write_hint != bio->bi_write_hint) + return false; + if (rq->ioprio != bio_prio(bio)) return false; diff --git a/block/blk-mq.c b/block/blk-mq.c index aa87fcfda1ec..34ceb15d2ea4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2585,6 +2585,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ @@ -3185,6 +3186,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; + rq->write_hint = rq_src->write_hint; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; diff --git a/block/bounce.c b/block/bounce.c index 7cfcb242f9a1..d6a5219f29dd 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src) if (bio_flagged(bio_src, BIO_REMAPPED)) bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; diff --git a/block/fops.c b/block/fops.c index 0cf8cf72cdfa..ab0e37d1dc48 100644 --- a/block/fops.c +++ b/block/fops.c @@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); } bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); @@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, for (;;) { bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; @@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, dio->flags = 0; dio->iocb = iocb; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint; bio->bi_end_io = blkdev_bio_end_io_async; bio->bi_ioprio = iocb->ki_ioprio; diff --git a/fs/buffer.c b/fs/buffer.c index d3bcf601d3e5..eb7d3ded2c33 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -55,7 +55,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, - struct writeback_control *wbc); + enum rw_hint hint, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -1944,7 +1945,8 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc); + submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, + inode->i_write_hint, wbc); nr_underway++; } bh = next; @@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio) } static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh, 
+ enum rw_hint write_hint,
 struct writeback_control *wbc)
 {
 const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);

 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_write_hint = write_hint;

 __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));

@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,

 void submit_bh(blk_opf_t opf, struct buffer_head *bh)
 {
- submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
 }
 EXPORT_SYMBOL(submit_bh);

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 bio->bi_end_io = dio_bio_end_io;
 if (dio->is_pinned)
 bio_set_flag(bio, BIO_PAGE_PINNED);
+ bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
 sdio->bio = bio;
 sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..18e1fef53fbc 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1667,6 +1667,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
 REQ_OP_WRITE | wbc_to_write_flags(wbc),
 GFP_NOFS, &iomap_ioend_bioset);
 bio->bi_iter.bi_sector = sector;
+ bio->bi_write_hint = inode->i_write_hint;
 wbc_init_bio(wbc, bio);

 ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 GFP_KERNEL);
 bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+ bio->bi_write_hint = inode->i_write_hint;
 bio->bi_ioprio = dio->iocb->ki_ioprio;
 bio->bi_private = dio;
 bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
 GFP_NOFS);
 bio->bi_iter.bi_sector = first_block << (blkbits - 9);
 wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
 }

 /*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7a8150a5f051..492b0128b5d9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>

 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -135,6 +136,7 @@ struct request {
 struct blk_crypto_keyslot *crypt_keyslot;
 #endif

+ enum rw_hint write_hint;
 unsigned short ioprio;

 enum mq_rq_state state;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f288c94374b3..12d87cef2c03 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <linux/rw_hint.h>

 struct bio_set;
 struct bio;
@@ -269,6 +270,7 @@ struct bio {
 */
 unsigned short bi_flags; /* BIO_* below */
 unsigned short bi_ioprio;
+ enum rw_hint bi_write_hint;
 blk_status_t bi_status;
 atomic_t __bi_remaining;
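The last two patches plumb the inode's lifetime hint through to every submitted bio, but the series never shows where i_write_hint itself originates: userspace sets it per open file with fcntl(F_SET_RW_HINT). Below is a minimal, illustrative userspace sketch, not part of the series; the fallback #defines are assumptions for toolchains whose headers predate the RWH_* constants, and "scratch.dat" is a hypothetical file name.

/*
 * Sketch: tag a file so data written through it is expected to be
 * short-lived. The kernel stores the value in inode->i_write_hint; the
 * patches above copy it into bio->bi_write_hint at submission time and
 * refuse to merge requests that carry different hints.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

#ifndef F_SET_RW_HINT			/* normally from <linux/fcntl.h> */
#define F_SET_RW_HINT	(1024 + 12)	/* F_LINUX_SPECIFIC_BASE + 12 */
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT	2
#endif

int main(void)
{
	uint64_t hint = RWH_WRITE_LIFE_SHORT;	/* expected write lifetime */
	int fd = open("scratch.dat", O_CREAT | O_WRONLY, 0644);

	if (fd < 0 || fcntl(fd, F_SET_RW_HINT, &hint) < 0) {
		perror("F_SET_RW_HINT");
		return 1;
	}
	/* subsequent writes through fd carry this hint down to the driver */
	return 0;
}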