From 30da1b45b130c70945b033900f45c9d61d6f3b4a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:14 +0200 Subject: io_uring: fix a layering violation in io_iopoll_req_issued syscall-level code can't just poke into the details of the poll cookie, which is private information of the block layer. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20211012111226.760968-5-hch@lst.de Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e68d27829bb2..d2e86788c872 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2738,19 +2738,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req) ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - unsigned int queue_num0, queue_num1; list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, inflight_entry); - if (list_req->file != req->file) { + if (list_req->file != req->file) ctx->poll_multi_queue = true; - } else { - queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); - queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); - if (queue_num0 != queue_num1) - ctx->poll_multi_queue = true; - } } /* -- cgit v1.2.3 From ef99b2d37666b7a600baab9e1c4944436652b0a2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:19 +0200 Subject: block: replace the spin argument to blk_iopoll with a flags argument Switch the boolean spin argument to blk_poll to passing a set of flags instead. This will allow to control polling behavior in a more fine grained way. Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-10-hch@lst.de [axboe: adapt to changed io_uring iopoll] Signed-off-by: Jens Axboe --- block/blk-exec.c | 2 +- block/blk-mq.c | 17 +++++++---------- block/fops.c | 8 ++++---- fs/io_uring.c | 9 +++++---- fs/iomap/direct-io.c | 6 +++--- include/linux/blkdev.h | 4 +++- include/linux/fs.h | 2 +- include/linux/iomap.h | 2 +- mm/page_io.c | 2 +- 9 files changed, 26 insertions(+), 26 deletions(-) (limited to 'fs/io_uring.c') diff --git a/block/blk-exec.c b/block/blk-exec.c index d6cd501c0d34..1fa7f25e5726 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -71,7 +71,7 @@ static bool blk_rq_is_poll(struct request *rq) static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), true); + blk_poll(rq->q, request_to_qc_t(rq->mq_hctx, rq), 0); cond_resched(); } while (!completion_done(wait)); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 7d0d947921a6..6609e10657a8 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4052,7 +4052,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) } static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, - bool spin) + unsigned int flags) { struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); @@ -4075,7 +4075,7 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, if (task_is_running(current)) return 1; - if (ret < 0 || !spin) + if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); @@ -4088,15 +4088,13 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, * blk_poll - poll for IO completions * @q: the queue * @cookie: cookie passed back at IO submission time - * @spin: whether to spin for completions + * @flags: BLK_POLL_* flags that control the behavior * * Description: * Poll for completions on the passed in queue. Returns number of - * completed entries found. If @spin is true, then blk_poll will continue - * looping until at least one completion is found, unless the task is - * otherwise marked running (or we need to reschedule). + * completed entries found. */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) { if (cookie == BLK_QC_T_NONE || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) @@ -4105,12 +4103,11 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) if (current->plug) blk_flush_plug_list(current->plug, false); - /* If specified not to spin, we also should not sleep. */ - if (spin && q->poll_nsec != BLK_MQ_POLL_CLASSIC) { + if (q->poll_nsec != BLK_MQ_POLL_CLASSIC) { if (blk_mq_poll_hybrid(q, cookie)) return 1; } - return blk_mq_poll_classic(q, cookie, spin); + return blk_mq_poll_classic(q, cookie, flags); } EXPORT_SYMBOL_GPL(blk_poll); diff --git a/block/fops.c b/block/fops.c index 15324f2e5a91..db8f2fe68dd2 100644 --- a/block/fops.c +++ b/block/fops.c @@ -108,7 +108,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) + !blk_poll(bdev_get_queue(bdev), qc, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -141,12 +141,12 @@ struct blkdev_dio { static struct bio_set blkdev_dio_pool; -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) +static int blkdev_iopoll(struct kiocb *kiocb, unsigned int flags) { struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); struct request_queue *q = bdev_get_queue(bdev); - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), flags); } static void blkdev_bio_end_io(struct bio *bio) @@ -297,7 +297,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->waiter)) break; - if (!do_poll || !blk_poll(bdev_get_queue(bdev), qc, true)) + if (!do_poll || !blk_poll(bdev_get_queue(bdev), qc, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/fs/io_uring.c b/fs/io_uring.c index d2e86788c872..541fec2bd49a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2457,14 +2457,15 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { struct io_kiocb *req, *tmp; + unsigned int poll_flags = 0; LIST_HEAD(done); - bool spin; /* * Only spin for completions if we don't have multiple devices hanging * off our complete list, and we're under the requested amount. */ - spin = !ctx->poll_multi_queue && *nr_events < min; + if (ctx->poll_multi_queue || *nr_events >= min) + poll_flags |= BLK_POLL_ONESHOT; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; @@ -2482,11 +2483,11 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) - spin = false; + poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 560ae967f70e..236aba256cd1 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -49,13 +49,13 @@ struct iomap_dio { }; }; -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) +int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags) { struct request_queue *q = READ_ONCE(kiocb->private); if (!q) return 0; - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); + return blk_poll(q, READ_ONCE(kiocb->ki_cookie), flags); } EXPORT_SYMBOL_GPL(iomap_dio_iopoll); @@ -642,7 +642,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!(iocb->ki_flags & IOCB_HIPRI) || !dio->submit.last_queue || !blk_poll(dio->submit.last_queue, - dio->submit.cookie, true)) + dio->submit.cookie, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 17705c970d7e..e177346bc020 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -564,7 +564,9 @@ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); +/* only poll the hardware once, don't continue until a completion was found */ +#define BLK_POLL_ONESHOT (1 << 0) +int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { diff --git a/include/linux/fs.h b/include/linux/fs.h index e7a633353fd2..c443cddf414f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2075,7 +2075,7 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, bool spin); + int (*iopoll)(struct kiocb *kiocb, unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 24f8489583ca..1e86b65567c2 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -337,7 +337,7 @@ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, unsigned int dio_flags); ssize_t iomap_dio_complete(struct iomap_dio *dio); -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); +int iomap_dio_iopoll(struct kiocb *kiocb, unsigned int flags); #ifdef CONFIG_SWAP struct file; diff --git a/mm/page_io.c b/mm/page_io.c index c493ce9ebcf5..5d5543fcefa4 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -428,7 +428,7 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!blk_poll(disk->queue, qc, true)) + if (!blk_poll(disk->queue, qc, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From d729cf9acb9311956c8a37113dcfa0160a2d9665 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 12 Oct 2021 13:12:20 +0200 Subject: io_uring: don't sleep when polling for I/O There is no point in sleeping for the expected I/O completion timeout in the io_uring async polling model as we never poll for a specific I/O. Signed-off-by: Christoph Hellwig Tested-by: Mark Wunderlich Link: https://lore.kernel.org/r/20211012111226.760968-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 ++- fs/io_uring.c | 2 +- include/linux/blkdev.h | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 6609e10657a8..97c24e461d0a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4103,7 +4103,8 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) if (current->plug) blk_flush_plug_list(current->plug, false); - if (q->poll_nsec != BLK_MQ_POLL_CLASSIC) { + if (!(flags & BLK_POLL_NOSLEEP) && + q->poll_nsec != BLK_MQ_POLL_CLASSIC) { if (blk_mq_poll_hybrid(q, cookie)) return 1; } diff --git a/fs/io_uring.c b/fs/io_uring.c index 541fec2bd49a..c5066146b8de 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2457,7 +2457,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, long min) { struct io_kiocb *req, *tmp; - unsigned int poll_flags = 0; + unsigned int poll_flags = BLK_POLL_NOSLEEP; LIST_HEAD(done); /* diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e177346bc020..2b80c98fc373 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -566,6 +566,8 @@ blk_status_t errno_to_blk_status(int errno); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) +/* do not sleep to wait for the expected completion time */ +#define BLK_POLL_NOSLEEP (1 << 1) int blk_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) -- cgit v1.2.3 From 5a72e899ceb465d731c413d57c6c12cdbf88303c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Oct 2021 09:24:29 -0600 Subject: block: add a struct io_comp_batch argument to fops->iopoll() struct io_comp_batch contains a list head and a completion handler, which will allow completions to more effciently completed batches of IO. For now, no functional changes in this patch, we just define the io_comp_batch structure and add the argument to the file_operations iopoll handler. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++---- block/blk-exec.c | 2 +- block/blk-mq.c | 9 +++++---- block/blk-mq.h | 3 ++- block/fops.c | 4 ++-- drivers/block/rnbd/rnbd-clt.c | 2 +- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 2 +- drivers/nvme/host/tcp.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- fs/io_uring.c | 2 +- fs/iomap/direct-io.c | 2 +- include/linux/blk-mq.h | 2 +- include/linux/blkdev.h | 13 +++++++++++-- include/linux/fs.h | 4 +++- mm/page_io.c | 2 +- 16 files changed, 39 insertions(+), 25 deletions(-) (limited to 'fs/io_uring.c') diff --git a/block/blk-core.c b/block/blk-core.c index 20b6cc06461a..d0c2e11411d0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1078,7 +1078,7 @@ EXPORT_SYMBOL(submit_bio); * Note: the caller must either be the context that submitted @bio, or * be in a RCU critical section to prevent freeing of @bio. */ -int bio_poll(struct bio *bio, unsigned int flags) +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { struct request_queue *q = bio->bi_bdev->bd_disk->queue; blk_qc_t cookie = READ_ONCE(bio->bi_cookie); @@ -1096,7 +1096,7 @@ int bio_poll(struct bio *bio, unsigned int flags) if (WARN_ON_ONCE(!queue_is_mq(q))) ret = 0; /* not yet implemented, should not happen */ else - ret = blk_mq_poll(q, cookie, flags); + ret = blk_mq_poll(q, cookie, iob, flags); blk_queue_exit(q); return ret; } @@ -1106,7 +1106,8 @@ EXPORT_SYMBOL_GPL(bio_poll); * Helper to implement file_operations.iopoll. Requires the bio to be stored * in iocb->private, and cleared before freeing the bio. */ -int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags) +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags) { struct bio *bio; int ret = 0; @@ -1134,7 +1135,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags) rcu_read_lock(); bio = READ_ONCE(kiocb->private); if (bio && bio->bi_bdev) - ret = bio_poll(bio, flags); + ret = bio_poll(bio, iob, flags); rcu_read_unlock(); return ret; diff --git a/block/blk-exec.c b/block/blk-exec.c index 55f0cd34b37b..1b8b47f6e79b 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -77,7 +77,7 @@ static bool blk_rq_is_poll(struct request *rq) static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - bio_poll(rq->bio, 0); + bio_poll(rq->bio, NULL, 0); cond_resched(); } while (!completion_done(wait)); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 74505b545dd3..79c25b64e8b0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4174,14 +4174,14 @@ static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) } static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, - unsigned int flags) + struct io_comp_batch *iob, unsigned int flags) { struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); int ret; do { - ret = q->mq_ops->poll(hctx); + ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { __set_current_state(TASK_RUNNING); return ret; @@ -4201,14 +4201,15 @@ static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, return 0; } -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) { if (!(flags & BLK_POLL_NOSLEEP) && q->poll_nsec != BLK_MQ_POLL_CLASSIC) { if (blk_mq_poll_hybrid(q, cookie)) return 1; } - return blk_mq_poll_classic(q, cookie, flags); + return blk_mq_poll_classic(q, cookie, iob, flags); } unsigned int blk_mq_rq_cpu(struct request *rq) diff --git a/block/blk-mq.h b/block/blk-mq.h index 1b91a3fdaa01..ebf67f4d4f2e 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -31,7 +31,8 @@ struct blk_mq_ctx { } ____cacheline_aligned_in_smp; void blk_mq_submit_bio(struct bio *bio); -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, unsigned int flags); +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); diff --git a/block/fops.c b/block/fops.c index 1d4f862950bb..2c43e493e37c 100644 --- a/block/fops.c +++ b/block/fops.c @@ -105,7 +105,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, 0)) + if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -291,7 +291,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->waiter)) break; - if (!do_poll || !bio_poll(bio, 0)) + if (!do_poll || !bio_poll(bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index bd4a41afbbfc..0ec0191d4196 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1176,7 +1176,7 @@ static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx, return ret; } -static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx) +static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct rnbd_queue *q = hctx->driver_data; struct rnbd_clt_dev *dev = q->dev; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 896328271471..bb0482dfab3c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1092,7 +1092,7 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } -static int nvme_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_queue *nvmeq = hctx->driver_data; bool found; @@ -1274,7 +1274,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * Did we miss an interrupt? */ if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) - nvme_poll(req->mq_hctx); + nvme_poll(req->mq_hctx, NULL); else nvme_poll_irqdisable(nvmeq); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 40317e1b9183..1624da3702d4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2106,7 +2106,7 @@ unmap_qe: return ret; } -static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_rdma_queue *queue = hctx->driver_data; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 3c1c29dd3020..9ce3458ee1dd 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2429,7 +2429,7 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) return 0; } -static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 33fd9a01330c..30f7d0b4eb73 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1784,7 +1784,7 @@ static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq, } -static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx) +static int scsi_mq_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct Scsi_Host *shost = hctx->driver_data; diff --git a/fs/io_uring.c b/fs/io_uring.c index c5066146b8de..cd77a137f2d8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2483,7 +2483,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, poll_flags); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 8efab177011d..83ecfba53abe 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -630,7 +630,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, break; if (!dio->submit.poll_bio || - !bio_poll(dio->submit.poll_bio, 0)) + !bio_poll(dio->submit.poll_bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9fb8618fb957..4c79439af2f2 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -532,7 +532,7 @@ struct blk_mq_ops { /** * @poll: Called to poll for completion of a specific tag. */ - int (*poll)(struct blk_mq_hw_ctx *); + int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b0a322172965..fd9771a1da09 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -569,8 +569,9 @@ blk_status_t errno_to_blk_status(int errno); #define BLK_POLL_ONESHOT (1 << 0) /* do not sleep to wait for the expected completion time */ #define BLK_POLL_NOSLEEP (1 << 1) -int bio_poll(struct bio *bio, unsigned int flags); -int iocb_bio_iopoll(struct kiocb *kiocb, unsigned int flags); +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags); +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { @@ -1298,6 +1299,14 @@ int fsync_bdev(struct block_device *bdev); int freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev); +struct io_comp_batch { + struct request *req_list; + bool need_ts; + void (*complete)(struct io_comp_batch *); +}; + +#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { } + #define rq_list_add(listptr, rq) do { \ (rq)->rq_next = *(listptr); \ *(listptr) = rq; \ diff --git a/include/linux/fs.h b/include/linux/fs.h index f595f4097cb7..31029a91f440 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -48,6 +48,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; +struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; @@ -2071,7 +2072,8 @@ struct file_operations { ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); - int (*iopoll)(struct kiocb *kiocb, unsigned int flags); + int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, + unsigned int flags); int (*iterate) (struct file *, struct dir_context *); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); diff --git a/mm/page_io.c b/mm/page_io.c index a68faab5b310..6010fb07f231 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -424,7 +424,7 @@ int swap_readpage(struct page *page, bool synchronous) if (!READ_ONCE(bio->bi_private)) break; - if (!bio_poll(bio, 0)) + if (!bio_poll(bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From b688f11e86c9a22169a0e522530982735d2db19b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Oct 2021 09:28:46 -0600 Subject: io_uring: utilize the io batching infrastructure for more efficient polled IO Wire up using an io_comp_batch for f_op->iopoll(). If the lower stack supports it, we can handle high rates of polled IO more efficiently. This raises the single core efficiency on my system from ~6.1M IOPS to ~6.6M IOPS running a random read workload at depth 128 on two gen2 Optane drives. Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index cd77a137f2d8..d4631a55a692 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2458,6 +2458,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, { struct io_kiocb *req, *tmp; unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); LIST_HEAD(done); /* @@ -2483,17 +2484,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, if (!list_empty(&done)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, NULL, poll_flags); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (READ_ONCE(req->iopoll_completed)) + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) list_move_tail(&req->inflight_entry, &done); } + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); if (!list_empty(&done)) io_iopoll_complete(ctx, nr_events, &done); -- cgit v1.2.3