summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--block/bio.c2
-rw-r--r--block/blk-crypto-fallback.c1
-rw-r--r--block/blk-merge.c8
-rw-r--r--block/blk-mq.c2
-rw-r--r--block/bounce.c1
-rw-r--r--block/fops.c5
-rw-r--r--fs/buffer.c12
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/f2fs/f2fs.h1
-rw-r--r--fs/fcntl.c64
-rw-r--r--fs/gfs2/bmap.c2
-rw-r--r--fs/inode.c1
-rw-r--r--fs/iomap/buffered-io.c579
-rw-r--r--fs/iomap/direct-io.c1
-rw-r--r--fs/iomap/trace.h48
-rw-r--r--fs/mpage.c1
-rw-r--r--fs/xfs/xfs_aops.c9
-rw-r--r--fs/zonefs/file.c3
-rw-r--r--include/linux/blk-mq.h2
-rw-r--r--include/linux/blk_types.h2
-rw-r--r--include/linux/fs.h16
-rw-r--r--include/linux/iomap.h19
-rw-r--r--include/linux/rw_hint.h24
23 files changed, 455 insertions, 350 deletions
diff --git a/block/bio.c b/block/bio.c
index b9642a41f286..c9223e9d31da 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -251,6 +251,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_opf = opf;
bio->bi_flags = 0;
bio->bi_ioprio = 0;
+ bio->bi_write_hint = 0;
bio->bi_status = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
@@ -813,6 +814,7 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
bio_set_flag(bio, BIO_CLONED);
bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
if (bio->bi_bdev) {
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index e6468eab2681..b1e7415f8439 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -172,6 +172,7 @@ static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2d470cf2173e..2a06fd33039d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -810,6 +810,10 @@ static struct request *attempt_merge(struct request_queue *q,
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
+ /* Don't merge requests with different write hints. */
+ if (req->write_hint != next->write_hint)
+ return NULL;
+
if (req->ioprio != next->ioprio)
return NULL;
@@ -937,6 +941,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
+ /* Don't merge requests with different write hints. */
+ if (rq->write_hint != bio->bi_write_hint)
+ return false;
+
if (rq->ioprio != bio_prio(bio))
return false;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2dc01551e27c..9c475a89e3af 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2584,6 +2584,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
+ rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
/* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
@@ -3175,6 +3176,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->ioprio = rq_src->ioprio;
+ rq->write_hint = rq_src->write_hint;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
diff --git a/block/bounce.c b/block/bounce.c
index 7cfcb242f9a1..d6a5219f29dd 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -169,6 +169,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src)
if (bio_flagged(bio_src, BIO_REMAPPED))
bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
diff --git a/block/fops.c b/block/fops.c
index 0cf8cf72cdfa..63566a3db186 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -73,6 +73,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
}
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio.bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio.bi_ioprio = iocb->ki_ioprio;
ret = bio_iov_iter_get_pages(&bio, iter);
@@ -203,6 +204,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
for (;;) {
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -321,6 +323,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
dio->flags = 0;
dio->iocb = iocb;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_write_hint = file_inode(iocb->ki_filp)->i_write_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
@@ -482,7 +485,7 @@ static void blkdev_readahead(struct readahead_control *rac)
}
static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset, unsigned int len)
{
loff_t isize = i_size_read(inode);
diff --git a/fs/buffer.c b/fs/buffer.c
index 9a54077de87d..4f73d23c2c46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -55,7 +55,7 @@
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
- struct writeback_control *wbc);
+ enum rw_hint hint, struct writeback_control *wbc);
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
do {
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -1944,7 +1945,8 @@ recover:
struct buffer_head *next = bh->b_this_page;
if (buffer_async_write(bh)) {
clear_buffer_dirty(bh);
- submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+ submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+ inode->i_write_hint, wbc);
nr_underway++;
}
bh = next;
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
}
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+ enum rw_hint write_hint,
struct writeback_control *wbc)
{
const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+ bio->bi_write_hint = write_hint;
__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
- submit_bh_wbc(opf, bh, NULL);
+ submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
bio->bi_end_io = dio_bio_end_io;
if (dio->is_pinned)
bio_set_flag(bio, BIO_PAGE_PINNED);
+ bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
sdio->bio = bio;
sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
}
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index dda0aed95464..4a4e60cdac4e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
#include <linux/blkdev.h>
#include <linux/quotaops.h>
#include <linux/part_stat.h>
+#include <linux/rw_hint.h>
#include <crypto/hash.h>
#include <linux/fscrypt.h>
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c3e342eb74af..54cc85d3338e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
+#include <linux/rw_hint.h>
#include <linux/poll.h>
#include <asm/siginfo.h>
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
}
#endif
-static bool rw_hint_valid(enum rw_hint hint)
+static bool rw_hint_valid(u64 hint)
{
+ BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+ BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+ BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+ BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+ BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+ BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
switch (hint) {
case RWH_WRITE_LIFE_NOT_SET:
case RWH_WRITE_LIFE_NONE:
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
}
}
-static long fcntl_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
- enum rw_hint hint;
- u64 h;
+ u64 hint = READ_ONCE(inode->i_write_hint);
- switch (cmd) {
- case F_GET_RW_HINT:
- h = inode->i_write_hint;
- if (copy_to_user(argp, &h, sizeof(*argp)))
- return -EFAULT;
- return 0;
- case F_SET_RW_HINT:
- if (copy_from_user(&h, argp, sizeof(h)))
- return -EFAULT;
- hint = (enum rw_hint) h;
- if (!rw_hint_valid(hint))
- return -EINVAL;
+ if (copy_to_user(argp, &hint, sizeof(*argp)))
+ return -EFAULT;
+ return 0;
+}
- inode_lock(inode);
- inode->i_write_hint = hint;
- inode_unlock(inode);
- return 0;
- default:
+static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct inode *inode = file_inode(file);
+ u64 __user *argp = (u64 __user *)arg;
+ u64 hint;
+
+ if (copy_from_user(&hint, argp, sizeof(hint)))
+ return -EFAULT;
+ if (!rw_hint_valid(hint))
return -EINVAL;
- }
+
+ WRITE_ONCE(inode->i_write_hint, hint);
+
+ /*
+ * file->f_mapping->host may differ from inode. As an example,
+ * blkdev_open() modifies file->f_mapping.
+ */
+ if (file->f_mapping->host != inode)
+ WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
+
+ return 0;
}
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
+ err = fcntl_get_rw_hint(filp, cmd, arg);
+ break;
case F_SET_RW_HINT:
- err = fcntl_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, cmd, arg);
break;
default:
break;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..789af5c8fade 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -2465,7 +2465,7 @@ out:
}
static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset)
+ loff_t offset, unsigned int len)
{
int ret;
diff --git a/fs/inode.c b/fs/inode.c
index 5c8a5250d0ac..d290f007b3d1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
+#include <linux/rw_hint.h>
#include <trace/events/writeback.h>
#include "internal.h"
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..4e8e41c8b3c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
*/
#include <linux/module.h>
#include <linux/compiler.h>
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
return test_bit(block + blks_per_folio, ifs->state);
}
+static unsigned ifs_find_dirty_range(struct folio *folio,
+ struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+ struct inode *inode = folio->mapping->host;
+ unsigned start_blk =
+ offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+ unsigned end_blk = min_not_zero(
+ offset_in_folio(folio, range_end) >> inode->i_blkbits,
+ i_blocks_per_folio(inode, folio));
+ unsigned nblks = 1;
+
+ while (!ifs_block_is_dirty(folio, ifs, start_blk))
+ if (++start_blk == end_blk)
+ return 0;
+
+ while (start_blk + nblks < end_blk) {
+ if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+ break;
+ nblks++;
+ }
+
+ *range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+ return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+ u64 range_end)
+{
+ struct iomap_folio_state *ifs = folio->private;
+
+ if (*range_start >= range_end)
+ return 0;
+
+ if (ifs)
+ return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+ return range_end - *range_start;
+}
+
static void ifs_clear_range_dirty(struct folio *folio,
struct iomap_folio_state *ifs, size_t off, size_t len)
{
@@ -1454,15 +1492,10 @@ out_unlock:
EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
- size_t len, int error)
+ size_t len)
{
struct iomap_folio_state *ifs = folio->private;
- if (error) {
- folio_set_error(folio);
- mapping_set_error(inode->i_mapping, error);
- }
-
WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
@@ -1479,40 +1512,29 @@ static u32
iomap_finish_ioend(struct iomap_ioend *ioend, int error)
{
struct inode *inode = ioend->io_inode;
- struct bio *bio = &ioend->io_inline_bio;
- struct bio *last = ioend->io_bio, *next;
- u64 start = bio->bi_iter.bi_sector;
- loff_t offset = ioend->io_offset;
- bool quiet = bio_flagged(bio, BIO_QUIET);
+ struct bio *bio = &ioend->io_bio;
+ struct folio_iter fi;
u32 folio_count = 0;
- for (bio = &ioend->io_inline_bio; bio; bio = next) {
- struct folio_iter fi;
-
- /*
- * For the last bio, bi_private points to the ioend, so we
- * need to explicitly end the iteration here.
- */
- if (bio == last)
- next = NULL;
- else
- next = bio->bi_private;
-
- /* walk all folios in bio, ending page IO on them */
- bio_for_each_folio_all(fi, bio) {
- iomap_finish_folio_write(inode, fi.folio, fi.length,
- error);
- folio_count++;
+ if (error) {
+ mapping_set_error(inode->i_mapping, error);
+ if (!bio_flagged(bio, BIO_QUIET)) {
+ pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+ inode->i_sb->s_id, inode->i_ino,
+ ioend->io_offset, ioend->io_sector);
}
- bio_put(bio);
}
- /* The ioend has been freed by bio_put() */
- if (unlikely(error && !quiet)) {
- printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
- inode->i_sb->s_id, inode->i_ino, offset, start);
+ /* walk all folios in bio, ending page IO on them */
+ bio_for_each_folio_all(fi, bio) {
+ if (error)
+ folio_set_error(fi.folio);
+ iomap_finish_folio_write(inode, fi.folio, fi.length);
+ folio_count++;
}
+
+ bio_put(bio); /* frees the ioend */
return folio_count;
}
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
static bool
iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
{
- if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+ if (ioend->io_bio.bi_status != next->io_bio.bi_status)
return false;
if ((ioend->io_flags & IOMAP_F_SHARED) ^
(next->io_flags & IOMAP_F_SHARED))
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
static void iomap_writepage_end_bio(struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
-
- iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+ iomap_finish_ioend(iomap_ioend_from_bio(bio),
+ blk_status_to_errno(bio->bi_status));
}
/*
* Submit the final bio for an ioend.
*
* If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them. In this situation, we need to fail the bio instead of
- * submitting it. This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system proess the errors.
*/
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
- int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
{
- ioend->io_bio->bi_private = ioend;
- ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+ if (!wpc->ioend)
+ return error;
+ /*
+ * Let the file systems prepare the I/O submission and hook in an I/O
+ * comletion handler. This also needs to happen in case after a
+ * failure happened so that the file system end I/O handler gets called
+ * to clean up.
+ */
if (wpc->ops->prepare_ioend)
- error = wpc->ops->prepare_ioend(ioend, error);
+ error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
if (error) {
- /*
- * If we're failing the IO now, just mark the ioend with an
- * error and finish it. This will run IO completion immediately
- * as there is only one reference to the ioend at this point in
- * time.
- */
- ioend->io_bio->bi_status = errno_to_blk_status(error);
- bio_endio(ioend->io_bio);
- return error;
+ wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+ bio_endio(&wpc->ioend->io_bio);
+ } else {
+ submit_bio(&wpc->ioend->io_bio);
}
- submit_bio(ioend->io_bio);
- return 0;
+ wpc->ioend = NULL;
+ return error;
}
-static struct iomap_ioend *
-iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
- loff_t offset, sector_t sector, struct writeback_control *wbc)
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct inode *inode, loff_t pos)
{
struct iomap_ioend *ioend;
struct bio *bio;
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS, &iomap_ioend_bioset);
- bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+ bio->bi_end_io = iomap_writepage_end_bio;
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
- ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+ ioend = iomap_ioend_from_bio(bio);
INIT_LIST_HEAD(&ioend->io_list);
ioend->io_type = wpc->iomap.type;
ioend->io_flags = wpc->iomap.flags;
ioend->io_inode = inode;
ioend->io_size = 0;
- ioend->io_folios = 0;
- ioend->io_offset = offset;
- ioend->io_bio = bio;
- ioend->io_sector = sector;
- return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in iomap_finish_ioend().
- */
-static struct bio *
-iomap_chain_bio(struct bio *prev)
-{
- struct bio *new;
-
- new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
- bio_clone_blkg_association(new, prev);
- new->bi_iter.bi_sector = bio_end_sector(prev);
+ ioend->io_offset = pos;
+ ioend->io_sector = bio->bi_iter.bi_sector;
- bio_chain(prev, new);
- bio_get(prev); /* for iomap_finish_ioend */
- submit_bio(prev);
- return new;
+ wpc->nr_folios = 0;
+ return ioend;
}
-static bool
-iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
- sector_t sector)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
{
if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
(wpc->ioend->io_flags & IOMAP_F_SHARED))
return false;
if (wpc->iomap.type != wpc->ioend->io_type)
return false;
- if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+ if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
return false;
- if (sector != bio_end_sector(wpc->ioend->io_bio))
+ if (iomap_sector(&wpc->iomap, pos) !=
+ bio_end_sector(&wpc->ioend->io_bio))
return false;
/*
* Limit ioend bio chain lengths to minimise IO completion latency. This
* also prevents long tight loops ending page writeback on all the
* folios in the ioend.
*/
- if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+ if (wpc->nr_folios >= IOEND_BATCH_SIZE)
return false;
return true;
}
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
/*
* Test to see if we have an existing ioend structure that we could append to
* first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly. Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
*/
-static void
-iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
- struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct list_head *iolist)
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, loff_t pos, unsigned len)
{
- sector_t sector = iomap_sector(&wpc->iomap, pos);
- unsigned len = i_blocksize(inode);
+ struct iomap_folio_state *ifs = folio->private;
size_t poff = offset_in_folio(folio, pos);
+ int error;
- if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
- if (wpc->ioend)
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
+ if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+new_ioend:
+ error = iomap_submit_ioend(wpc, 0);
+ if (error)
+ return error;
+ wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
}
- if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
- wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
- bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
- }
+ if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+ goto new_ioend;
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
wbc_account_cgroup_owner(wbc, &folio->page, len);
+ return 0;
}
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected. While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-iomap_writepage_map(struct iomap_writepage_ctx *wpc,
- struct writeback_control *wbc, struct inode *inode,
- struct folio *folio, u64 end_pos)
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio,
+ struct inode *inode, u64 pos, unsigned dirty_len,
+ unsigned *count)
{
- struct iomap_folio_state *ifs = folio->private;
- struct iomap_ioend *ioend, *next;
- unsigned len = i_blocksize(inode);
- unsigned nblocks = i_blocks_per_folio(inode, folio);
- u64 pos = folio_pos(folio);
- int error = 0, count = 0, i;
- LIST_HEAD(submit_list);
-
- WARN_ON_ONCE(end_pos <= pos);
-
- if (!ifs && nblocks > 1) {
- ifs = ifs_alloc(inode, folio, 0);
- iomap_set_range_dirty(folio, 0, end_pos - pos);
- }
+ int error;
- WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
-
- /*
- * Walk through the folio to find areas to write back. If we
- * run off the end of the current map or find the current map
- * invalid, grab a new one.
- */
- for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
- if (ifs && !ifs_block_is_dirty(folio, ifs, i))
- continue;
+ do {
+ unsigned map_len;
- error = wpc->ops->map_blocks(wpc, inode, pos);
+ error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
if (error)
break;
- trace_iomap_writepage_map(inode, &wpc->iomap);
- if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
- continue;
- if (wpc->iomap.type == IOMAP_HOLE)
- continue;
- iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
- &submit_list);
- count++;
- }
- if (count)
- wpc->ioend->io_folios++;
+ trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
- WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
- WARN_ON_ONCE(!folio_test_locked(folio));
- WARN_ON_ONCE(folio_test_writeback(folio));
- WARN_ON_ONCE(folio_test_dirty(folio));
+ map_len = min_t(u64, dirty_len,
+ wpc->iomap.offset + wpc->iomap.length - pos);
+ WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+ switch (wpc->iomap.type) {
+ case IOMAP_INLINE:
+ WARN_ON_ONCE(1);
+ error = -EIO;
+ break;
+ case IOMAP_HOLE:
+ break;
+ default:
+ error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+ map_len);
+ if (!error)
+ (*count)++;
+ break;
+ }
+ dirty_len -= map_len;
+ pos += map_len;
+ } while (dirty_len && !error);
/*
* We cannot cancel the ioend directly here on error. We may have
* already set other pages under writeback and hence we have to run I/O
* completion to mark the error state of the pages under writeback
* appropriately.
+ *
+ * Just let the file system know what portion of the folio failed to
+ * map.
*/
- if (unlikely(error)) {
- /*
- * Let the filesystem know what portion of the current page
- * failed to map. If the page hasn't been added to ioend, it
- * won't be affected by I/O completion and we must unlock it
- * now.
- */
- if (wpc->ops->discard_folio)
- wpc->ops->discard_folio(folio, pos);
- if (!count) {
- folio_unlock(folio);
- goto done;
- }
- }
-
- /*
- * We can have dirty bits set past end of file in page_mkwrite path
- * while mapping the last partial folio. Hence it's better to clear
- * all the dirty bits in the folio here.
- */
- iomap_clear_range_dirty(folio, 0, folio_size(folio));
- folio_start_writeback(folio);
- folio_unlock(folio);
-
- /*
- * Preserve the original error if there was one; catch
- * submission errors here and propagate into subsequent ioend
- * submissions.
- */
- list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
- int error2;
-
- list_del_init(&ioend->io_list);
- error2 = iomap_submit_ioend(wpc, ioend, error);
- if (error2 && !error)
- error = error2;
- }
-
- /*
- * We can end up here with no error and nothing to write only if we race
- * with a partial page truncate on a sub-page block sized filesystem.
- */
- if (!count)
- folio_end_writeback(folio);
-done:
- mapping_set_error(inode->i_mapping, error);
+ if (error && wpc->ops->discard_folio)
+ wpc->ops->discard_folio(folio, pos);
return error;
}
/*
- * Write out a dirty page.
+ * Check interaction of the folio with the file end.
*
- * For delalloc space on the page, we need to allocate space and flush it.
- * For unwritten space on the page, we need to start the conversion to
- * regular allocated space.
+ * If the folio is entirely beyond i_size, return false. If it straddles
+ * i_size, adjust end_pos and zero all data beyond i_size.
*/
-static int iomap_do_writepage(struct folio *folio,
- struct writeback_control *wbc, void *data)
+static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+ u64 *end_pos)
{
- struct iomap_writepage_ctx *wpc = data;
- struct inode *inode = folio->mapping->host;
- u64 end_pos, isize;
-
- trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
+ u64 isize = i_size_read(inode);
- /*
- * Refuse to write the folio out if we're called from reclaim context.
- *
- * This avoids stack overflows when called from deeply used stacks in
- * random callers for direct reclaim or memcg reclaim. We explicitly
- * allow reclaim from kswapd as the stack usage there is relatively low.
- *
- * This should never happen except in the case of a VM regression so
- * warn about it.
- */
- if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
- PF_MEMALLOC))
- goto redirty;
-
- /*
- * Is this folio beyond the end of the file?
- *
- * The folio index is less than the end_index, adjust the end_pos
- * to the highest offset that this folio should represent.
- * -----------------------------------------------------
- * | file mapping | <EOF> |
- * -----------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | |
- * ^--------------------------------^----------|--------
- * | desired writeback range | see else |
- * ---------------------------------^------------------|
- */
- isize = i_size_read(inode);
- end_pos = folio_pos(folio) + folio_size(folio);
- if (end_pos > isize) {
- /*
- * Check whether the page to write out is beyond or straddles
- * i_size or not.
- * -------------------------------------------------------
- * | file mapping | <EOF> |
- * -------------------------------------------------------
- * | Page ... | Page N-2 | Page N-1 | Page N | Beyond |
- * ^--------------------------------^-----------|---------
- * | | Straddles |
- * ---------------------------------^-----------|--------|
- */
+ if (*end_pos > isize) {
size_t poff = offset_in_folio(folio, isize);
pgoff_t end_index = isize >> PAGE_SHIFT;
/*
- * Skip the page if it's fully outside i_size, e.g.
- * due to a truncate operation that's in progress. We've
- * cleaned this page and truncate will finish things off for
- * us.
+ * If the folio is entirely ouside of i_size, skip it.
+ *
+ * This can happen due to a truncate operation that is in
+ * progress and in that case truncate will finish it off once
+ * we've dropped the folio lock.
*
- * Note that the end_index is unsigned long. If the given
- * offset is greater than 16TB on a 32-bit system then if we
- * checked if the page is fully outside i_size with
- * "if (page->index >= end_index + 1)", "end_index + 1" would
- * overflow and evaluate to 0. Hence this page would be
+ * Note that the pgoff_t used for end_index is an unsigned long.
+ * If the given offset is greater than 16TB on a 32-bit system,
+ * then if we checked if the folio is fully outside i_size with
+ * "if (folio->index >= end_index + 1)", "end_index + 1" would
+ * overflow and evaluate to 0. Hence this folio would be
* redirtied and written out repeatedly, which would result in
* an infinite loop; the user program performing this operation
* would hang. Instead, we can detect this situation by
- * checking if the page is totally beyond i_size or if its
+ * checking if the folio is totally beyond i_size or if its
* offset is just equal to the EOF.
*/
if (folio->index > end_index ||
(folio->index == end_index && poff == 0))
- goto unlock;
+ return false;
/*
- * The page straddles i_size. It must be zeroed out on each
- * and every writepage invocation because it may be mmapped.
- * "A file is mapped in multiples of the page size. For a file
- * that is not a multiple of the page size, the remaining
- * memory is zeroed when mapped, and writes to that region are
- * not written out to the file."
+ * The folio straddles i_size.
+ *
+ * It must be zeroed out on each and every writepage invocation
+ * because it may be mmapped:
+ *
+ * A file is mapped in multiples of the page size. For a
+ * file that is not a multiple of the page size, the
+ * remaining memory is zeroed when mapped, and writes to that
+ * region are not written out to the file.
+ *
+ * Also adjust the writeback range to skip all blocks entirely
+ * beyond i_size.
*/
folio_zero_segment(folio, poff, folio_size(folio));
- end_pos = isize;
+ *end_pos = round_up(isize, i_blocksize(inode));
+ }
+
+ return true;
+}
+
+static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+ struct writeback_control *wbc, struct folio *folio)
+{
+ struct iomap_folio_state *ifs = folio->private;
+ struct inode *inode = folio->mapping->host;
+ u64 pos = folio_pos(folio);
+ u64 end_pos = pos + folio_size(folio);
+ unsigned count = 0;
+ int error = 0;
+ u32 rlen;
+
+ WARN_ON_ONCE(!folio_test_locked(folio));
+ WARN_ON_ONCE(folio_test_dirty(folio));
+ WARN_ON_ONCE(folio_test_writeback(folio));
+
+ trace_iomap_writepage(inode, pos, folio_size(folio));
+
+ if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
+ folio_unlock(folio);
+ return 0;
+ }
+ WARN_ON_ONCE(end_pos <= pos);
+
+ if (i_blocks_per_folio(inode, folio) > 1) {
+ if (!ifs) {
+ ifs = ifs_alloc(inode, folio, 0);
+ iomap_set_range_dirty(folio, 0, end_pos - pos);
+ }
+
+ /*
+ * Keep the I/O completion handler from clearing the writeback
+ * bit until we have submitted all blocks by adding a bias to
+ * ifs->write_bytes_pending, which is dropped after submitting
+ * all blocks.
+ */
+ WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+ atomic_inc(&ifs->write_bytes_pending);
}
- return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
+ /*
+ * Set the writeback bit ASAP, as the I/O completion for the single
+ * block per folio case happen hit as soon as we're submitting the bio.
+ */
+ folio_start_writeback(folio);
-redirty:
- folio_redirty_for_writepage(wbc, folio);
-unlock:
+ /*
+ * Walk through the folio to find dirty areas to write back.
+ */
+ while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+ error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+ pos, rlen, &count);
+ if (error)
+ break;
+ pos += rlen;
+ }
+
+ if (count)
+ wpc->nr_folios++;
+
+ /*
+ * We can have dirty bits set past end of file in page_mkwrite path
+ * while mapping the last partial folio. Hence it's better to clear
+ * all the dirty bits in the folio here.
+ */
+ iomap_clear_range_dirty(folio, 0, folio_size(folio));
+
+ /*
+ * Usually the writeback bit is cleared by the I/O completion handler.
+ * But we may end up either not actually writing any blocks, or (when
+ * there are multiple blocks in a folio) all I/O might have finished
+ * already at this point. In that case we need to clear the writeback
+ * bit ourselves right after unlocking the page.
+ */
folio_unlock(folio);
- return 0;
+ if (ifs) {
+ if (atomic_dec_and_test(&ifs->write_bytes_pending))
+ folio_end_writeback(folio);
+ } else {
+ if (!count)
+ folio_end_writeback(folio);
+ }
+ mapping_set_error(inode->i_mapping, error);
+ return error;
+}
+
+static int iomap_do_writepage(struct folio *folio,
+ struct writeback_control *wbc, void *data)
+{
+ return iomap_writepage_map(data, wbc, folio);
}
int
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
{
int ret;
+ /*
+ * Writeback from reclaim context should never happen except in the case
+ * of a VM regression so warn about it and refuse to write the data.
+ */
+ if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
+ PF_MEMALLOC))
+ return -EIO;
+
wpc->ops = ops;
ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
- if (!wpc->ioend)
- return ret;
- return iomap_submit_ioend(wpc, wpc->ioend, ret);
+ return iomap_submit_ioend(wpc, ret);
}
EXPORT_SYMBOL_GPL(iomap_writepages);
static int __init iomap_init(void)
{
return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
- offsetof(struct iomap_ioend, io_inline_bio),
+ offsetof(struct iomap_ioend, io_bio),
BIOSET_NEED_BVECS);
}
fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+ bio->bi_write_hint = inode->i_write_hint;
bio->bi_ioprio = dio->iocb->ki_ioprio;
bio->bi_private = dio;
bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..0a991c4ce87d 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name, \
TP_ARGS(inode, iomap))
DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-DEFINE_IOMAP_EVENT(iomap_writepage_map);
+
+TRACE_EVENT(iomap_writepage_map,
+ TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
+ struct iomap *iomap),
+ TP_ARGS(inode, pos, dirty_len, iomap),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(u64, ino)
+ __field(u64, pos)
+ __field(u64, dirty_len)
+ __field(u64, addr)
+ __field(loff_t, offset)
+ __field(u64, length)
+ __field(u16, type)
+ __field(u16, flags)
+ __field(dev_t, bdev)
+ ),
+ TP_fast_assign(
+ __entry->dev = inode->i_sb->s_dev;
+ __entry->ino = inode->i_ino;
+ __entry->pos = pos;
+ __entry->dirty_len = dirty_len;
+ __entry->addr = iomap->addr;
+ __entry->offset = iomap->offset;
+ __entry->length = iomap->length;
+ __entry->type = iomap->type;
+ __entry->flags = iomap->flags;
+ __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
+ "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ MAJOR(__entry->bdev), MINOR(__entry->bdev),
+ __entry->pos,
+ __entry->dirty_len,
+ __entry->addr,
+ __entry->offset,
+ __entry->length,
+ __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+ __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+);
TRACE_EVENT(iomap_iter,
TP_PROTO(struct iomap_iter *iter, const void *ops,
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
__field(u64, ino)
__field(loff_t, pos)
__field(u64, length)
+ __field(s64, processed)
__field(unsigned int, flags)
__field(const void *, ops)
__field(unsigned long, caller)
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
__entry->ino = iter->inode->i_ino;
__entry->pos = iter->pos;
__entry->length = iomap_length(iter);
+ __entry->processed = iter->processed;
__entry->flags = iter->flags;
__entry->ops = ops;
__entry->caller = caller;
),
- TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->ino,
__entry->pos,
__entry->length,
+ __entry->processed,
__print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
__entry->flags,
__entry->ops,
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
GFP_NOFS);
bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
+ bio->bi_write_hint = inode->i_write_hint;
}
/*
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..1698507d1ac7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
* longer dirty. If we don't remove delalloc blocks here, they become
* stale and can corrupt free space accounting on unmount.
*/
- error = blk_status_to_errno(ioend->io_bio->bi_status);
+ error = blk_status_to_errno(ioend->io_bio.bi_status);
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
xfs_end_bio(
struct bio *bio)
{
- struct iomap_ioend *ioend = bio->bi_private;
+ struct iomap_ioend *ioend = iomap_ioend_from_bio(bio);
struct xfs_inode *ip = XFS_I(ioend->io_inode);
unsigned long flags;
@@ -276,7 +276,8 @@ static int
xfs_map_blocks(
struct iomap_writepage_ctx *wpc,
struct inode *inode,
- loff_t offset)
+ loff_t offset,
+ unsigned int len)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
/* send ioends that might require a transaction to the completion wq */
if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
(ioend->io_flags & IOMAP_F_SHARED))
- ioend->io_bio->bi_end_io = xfs_end_bio;
+ ioend->io_bio.bi_end_io = xfs_end_bio;
return status;
}
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index dba5dcb62bef..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
* which implies that the page range can only be within the fixed inode size.
*/
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+ struct inode *inode, loff_t offset,
+ unsigned int len)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7a8150a5f051..492b0128b5d9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -8,6 +8,7 @@
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
+#include <linux/rw_hint.h>
struct blk_mq_tags;
struct blk_flush_queue;
@@ -135,6 +136,7 @@ struct request {
struct blk_crypto_keyslot *crypt_keyslot;
#endif
+ enum rw_hint write_hint;
unsigned short ioprio;
enum mq_rq_state state;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index f288c94374b3..12d87cef2c03 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -10,6 +10,7 @@
#include <linux/bvec.h>
#include <linux/device.h>
#include <linux/ktime.h>
+#include <linux/rw_hint.h>
struct bio_set;
struct bio;
@@ -269,6 +270,7 @@ struct bio {
*/
unsigned short bi_flags; /* BIO_* below */
unsigned short bi_ioprio;
+ enum rw_hint bi_write_hint;
blk_status_t bi_status;
atomic_t __bi_remaining;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba751d097c1..b26796ccc309 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -44,6 +44,7 @@
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
+#include <linux/rw_hint.h>
#include <asm/byteorder.h>
#include <uapi/linux/fs.h>
@@ -310,19 +311,6 @@ struct address_space;
struct writeback_control;
struct readahead_control;
-/*
- * Write life time hint values.
- * Stored in struct inode as u8.
- */
-enum rw_hint {
- WRITE_LIFE_NOT_SET = 0,
- WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
- WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
- WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
- WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
- WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
-};
-
/* Match RWF_* bits to IOCB bits */
#define IOCB_HIPRI (__force int) RWF_HIPRI
#define IOCB_DSYNC (__force int) RWF_DSYNC
@@ -680,7 +668,7 @@ struct inode {
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits;
- u8 i_write_hint;
+ enum rw_hint i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 96dd0acbba44..6fc1c858013d 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -293,22 +293,32 @@ struct iomap_ioend {
struct list_head io_list; /* next ioend in chain */
u16 io_type;
u16 io_flags; /* IOMAP_F_* */
- u32 io_folios; /* folios added to ioend */
struct inode *io_inode; /* file being written to */
size_t io_size; /* size of the extent */
loff_t io_offset; /* offset in the file */
sector_t io_sector; /* start sector of ioend */
- struct bio *io_bio; /* bio being built */
- struct bio io_inline_bio; /* MUST BE LAST! */
+ struct bio io_bio; /* MUST BE LAST! */
};
+static inline struct iomap_ioend *iomap_ioend_from_bio(struct bio *bio)
+{
+ return container_of(bio, struct iomap_ioend, io_bio);
+}
+
struct iomap_writeback_ops {
/*
* Required, maps the blocks so that writeback can be performed on
* the range starting at offset.
+ *
+ * Can return arbitrarily large regions, but we need to call into it at
+ * least once per folio to allow the file systems to synchronize with
+ * the write path that could be invalidating mappings.
+ *
+ * An existing mapping from a previous call to this method can be reused
+ * by the file system if it is still valid.
*/
int (*map_blocks)(struct iomap_writepage_ctx *wpc, struct inode *inode,
- loff_t offset);
+ loff_t offset, unsigned len);
/*
* Optional, allows the file systems to perform actions just before
@@ -329,6 +339,7 @@ struct iomap_writepage_ctx {
struct iomap iomap;
struct iomap_ioend *ioend;
const struct iomap_writeback_ops *ops;
+ u32 nr_folios; /* folios added to the ioend */
};
void iomap_finish_ioends(struct iomap_ioend *ioend, int error);
diff --git a/include/linux/rw_hint.h b/include/linux/rw_hint.h
new file mode 100644
index 000000000000..309ca72f2dfb
--- /dev/null
+++ b/include/linux/rw_hint.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RW_HINT_H
+#define _LINUX_RW_HINT_H
+
+#include <linux/build_bug.h>
+#include <linux/compiler_attributes.h>
+#include <uapi/linux/fcntl.h>
+
+/* Block storage write lifetime hint values. */
+enum rw_hint {
+ WRITE_LIFE_NOT_SET = RWH_WRITE_LIFE_NOT_SET,
+ WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE,
+ WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT,
+ WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM,
+ WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG,
+ WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME,
+} __packed;
+
+/* Sparse ignores __packed annotations on enums, hence the #ifndef below. */
+#ifndef __CHECKER__
+static_assert(sizeof(enum rw_hint) == 1);
+#endif
+
+#endif /* _LINUX_RW_HINT_H */