summaryrefslogtreecommitdiff
path: root/fs/iomap
diff options
context:
space:
mode:
authorDarrick J. Wong <djwong@kernel.org>2023-08-02 02:41:49 +0300
committerDarrick J. Wong <djwong@kernel.org>2023-08-02 02:41:49 +0300
commit377698d4abe2cd118dd866d5ef19e2f1aa6b9758 (patch)
tree4fc0e26800e058229bd1e0282f54a21f071e7a2e /fs/iomap
parenta67371b7aee931572eab313384e7fcd93255f6f8 (diff)
parent8c052fb3002e6e7eccdc138d2e459cd03f019ade (diff)
downloadlinux-377698d4abe2cd118dd866d5ef19e2f1aa6b9758.tar.xz
Merge tag 'xfs-async-dio.6-2023-08-01' of git://git.kernel.dk/linux into iomap-6.6-mergeA
Improve iomap/xfs async dio write performance iomap always punts async dio write completions to a workqueue, which has a cost in terms of efficiency (now you need an unrelated worker to process it) and latency (now you're bouncing a completion through an async worker, which is a classic slowdown scenario). io_uring handles IRQ completions via task_work, and for writes that don't need to do extra IO at completion time, we can safely complete them inline from that. This patchset adds IOCB_DIO_CALLER_COMP, which an IO issuer can set to inform the completion side that any extra work that needs doing for that completion can be punted to a safe task context. The iomap dio completion will happen in hard/soft irq context, and we need a saner context to process these completions. IOCB_DIO_CALLER_COMP is added, which can be set in a struct kiocb->ki_flags by the issuer. If the completion side of the iocb handling understands this flag, it can choose to set a kiocb->dio_complete() handler and just call ki_complete from IRQ context. The issuer must then ensure that this callback is processed from a task. io_uring punts IRQ completions to task_work already, so it's trivial wire it up to run more of the completion before posting a CQE. This is good for up to a 37% improvement in throughput/latency for low queue depth IO, patch 5 has the details. If we need to do real work at completion time, iomap will clear the IOMAP_DIO_CALLER_COMP flag. This work came about when Andres tested low queue depth dio writes for postgres and compared it to doing sync dio writes, showing that the async processing slows us down a lot. * tag 'xfs-async-dio.6-2023-08-01' of git://git.kernel.dk/linux: iomap: support IOCB_DIO_CALLER_COMP io_uring/rw: add write support for IOCB_DIO_CALLER_COMP fs: add IOCB flags related to passing back dio completions iomap: add IOMAP_DIO_INLINE_COMP iomap: only set iocb->private for polled bio iomap: treat a write through cache the same as FUA iomap: use an unsigned type for IOMAP_DIO_* defines iomap: cleanup up iomap_dio_bio_end_io() Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Diffstat (limited to 'fs/iomap')
-rw-r--r--fs/iomap/direct-io.c163
1 files changed, 123 insertions, 40 deletions
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index ea3b868c8355..bcd3f8cf5ea4 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -20,10 +20,12 @@
* Private flags for iomap_dio, must not overlap with the public ones in
* iomap.h:
*/
-#define IOMAP_DIO_WRITE_FUA (1 << 28)
-#define IOMAP_DIO_NEED_SYNC (1 << 29)
-#define IOMAP_DIO_WRITE (1 << 30)
-#define IOMAP_DIO_DIRTY (1 << 31)
+#define IOMAP_DIO_CALLER_COMP (1U << 26)
+#define IOMAP_DIO_INLINE_COMP (1U << 27)
+#define IOMAP_DIO_WRITE_THROUGH (1U << 28)
+#define IOMAP_DIO_NEED_SYNC (1U << 29)
+#define IOMAP_DIO_WRITE (1U << 30)
+#define IOMAP_DIO_DIRTY (1U << 31)
struct iomap_dio {
struct kiocb *iocb;
@@ -41,7 +43,6 @@ struct iomap_dio {
struct {
struct iov_iter *iter;
struct task_struct *waiter;
- struct bio *poll_bio;
} submit;
/* used for aio completion: */
@@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
+ struct kiocb *iocb = dio->iocb;
+
atomic_inc(&dio->ref);
/* Sync dio can't be polled reliably */
- if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) {
- bio_set_polled(bio, dio->iocb);
- dio->submit.poll_bio = bio;
+ if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) {
+ bio_set_polled(bio, iocb);
+ WRITE_ONCE(iocb->private, bio);
}
if (dio->dops && dio->dops->submit_io)
@@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);
+static ssize_t iomap_dio_deferred_complete(void *data)
+{
+ return iomap_dio_complete(data);
+}
+
static void iomap_dio_complete_work(struct work_struct *work)
{
struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
@@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
+ struct kiocb *iocb = dio->iocb;
if (bio->bi_status)
iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
+ if (!atomic_dec_and_test(&dio->ref))
+ goto release_bio;
- if (atomic_dec_and_test(&dio->ref)) {
- if (dio->wait_for_completion) {
- struct task_struct *waiter = dio->submit.waiter;
- WRITE_ONCE(dio->submit.waiter, NULL);
- blk_wake_io_task(waiter);
- } else if (dio->flags & IOMAP_DIO_WRITE) {
- struct inode *inode = file_inode(dio->iocb->ki_filp);
-
- WRITE_ONCE(dio->iocb->private, NULL);
- INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
- queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
- } else {
- WRITE_ONCE(dio->iocb->private, NULL);
- iomap_dio_complete_work(&dio->aio.work);
- }
+ /*
+ * Synchronous dio, task itself will handle any completion work
+ * that needs after IO. All we need to do is wake the task.
+ */
+ if (dio->wait_for_completion) {
+ struct task_struct *waiter = dio->submit.waiter;
+
+ WRITE_ONCE(dio->submit.waiter, NULL);
+ blk_wake_io_task(waiter);
+ goto release_bio;
+ }
+
+ /*
+ * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline
+ */
+ if (dio->flags & IOMAP_DIO_INLINE_COMP) {
+ WRITE_ONCE(iocb->private, NULL);
+ iomap_dio_complete_work(&dio->aio.work);
+ goto release_bio;
+ }
+
+ /*
+ * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule
+ * our completion that way to avoid an async punt to a workqueue.
+ */
+ if (dio->flags & IOMAP_DIO_CALLER_COMP) {
+ /* only polled IO cares about private cleared */
+ iocb->private = dio;
+ iocb->dio_complete = iomap_dio_deferred_complete;
+
+ /*
+ * Invoke ->ki_complete() directly. We've assigned our
+ * dio_complete callback handler, and since the issuer set
+ * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will
+ * notice ->dio_complete being set and will defer calling that
+ * handler until it can be done from a safe task context.
+ *
+ * Note that the 'res' being passed in here is not important
+ * for this case. The actual completion value of the request
+ * will be gotten from dio_complete when that is run by the
+ * issuer.
+ */
+ iocb->ki_complete(iocb, 0);
+ goto release_bio;
}
+ /*
+ * Async DIO completion that requires filesystem level completion work
+ * gets punted to a work queue to complete as the operation may require
+ * more IO to be issued to finalise filesystem metadata changes or
+ * guarantee data integrity.
+ */
+ INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
+ queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &dio->aio.work);
+release_bio:
if (should_dirty) {
bio_check_pages_dirty(bio);
} else {
@@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
/*
* Figure out the bio's operation flags from the dio request, the
* mapping, and whether or not we want FUA. Note that we can end up
- * clearing the WRITE_FUA flag in the dio request.
+ * clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
const struct iomap *iomap, bool use_fua)
@@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
if (use_fua)
opflags |= REQ_FUA;
else
- dio->flags &= ~IOMAP_DIO_WRITE_FUA;
+ dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
return opflags;
}
@@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
* Use a FUA write if we need datasync semantics, this is a pure
* data IO that doesn't require any metadata updates (including
* after IO completion such as unwritten extent conversion) and
- * the underlying device supports FUA. This allows us to avoid
- * cache flushes on IO completion.
+ * the underlying device either supports FUA or doesn't have
+ * a volatile write cache. This allows us to avoid cache flushes
+ * on IO completion. If we can't use writethrough and need to
+ * sync, disable in-task completions as dio completion will
+ * need to call generic_write_sync() which will do a blocking
+ * fsync / cache flush call.
*/
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
- (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev))
+ (dio->flags & IOMAP_DIO_WRITE_THROUGH) &&
+ (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev)))
use_fua = true;
+ else if (dio->flags & IOMAP_DIO_NEED_SYNC)
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
}
/*
@@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
/*
- * We can only poll for single bio I/Os.
+ * We can only do deferred completion for pure overwrites that
+ * don't require additional IO at completion. This rules out
+ * writes that need zeroing or extent conversion, extend
+ * the file size, or issue journal IO or cache flushes
+ * during completion processing.
*/
if (need_zeroout ||
+ ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) ||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
+ dio->flags &= ~IOMAP_DIO_CALLER_COMP;
+
+ /*
+ * The rules for polled IO completions follow the guidelines as the
+ * ones we set for inline and deferred completions. If none of those
+ * are available for this IO, clear the polled flag.
+ */
+ if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP)))
dio->iocb->ki_flags &= ~IOCB_HIPRI;
if (need_zeroout) {
@@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.iter = iter;
dio->submit.waiter = current;
- dio->submit.poll_bio = NULL;
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
if (iov_iter_rw(iter) == READ) {
+ /* reads can always complete inline */
+ dio->flags |= IOMAP_DIO_INLINE_COMP;
+
if (iomi.pos >= dio->i_size)
goto out_free_dio;
@@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
+ /*
+ * Flag as supporting deferred completions, if the issuer
+ * groks it. This can avoid a workqueue punt for writes.
+ * We may later clear this flag if we need to do other IO
+ * as part of this IO completion.
+ */
+ if (iocb->ki_flags & IOCB_DIO_CALLER_COMP)
+ dio->flags |= IOMAP_DIO_CALLER_COMP;
+
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
ret = -EAGAIN;
if (iomi.pos >= dio->i_size ||
@@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->flags |= IOMAP_DIO_NEED_SYNC;
/*
- * For datasync only writes, we optimistically try
- * using FUA for this IO. Any non-FUA write that
- * occurs will clear this flag, hence we know before
- * completion whether a cache flush is necessary.
+ * For datasync only writes, we optimistically try using
+ * WRITE_THROUGH for this IO. This flag requires either
+ * FUA writes through the device's write cache, or a
+ * normal write to a device without a volatile write
+ * cache. For the former, Any non-FUA write that occurs
+ * will clear this flag, hence we know before completion
+ * whether a cache flush is necessary.
*/
if (!(iocb->ki_flags & IOCB_SYNC))
- dio->flags |= IOMAP_DIO_WRITE_FUA;
+ dio->flags |= IOMAP_DIO_WRITE_THROUGH;
}
/*
@@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
iomap_dio_set_error(dio, ret);
/*
- * If all the writes we issued were FUA, we don't need to flush the
- * cache on IO completion. Clear the sync flag for this case.
+ * If all the writes we issued were already written through to the
+ * media, we don't need to flush the cache on IO completion. Clear the
+ * sync flag for this case.
*/
- if (dio->flags & IOMAP_DIO_WRITE_FUA)
+ if (dio->flags & IOMAP_DIO_WRITE_THROUGH)
dio->flags &= ~IOMAP_DIO_NEED_SYNC;
- WRITE_ONCE(iocb->private, dio->submit.poll_bio);
-
/*
* We are about to drop our additional submission reference, which
* might be the last reference to the dio. There are three different