From da12d9ab5889b87429d9375748dcd1485b6241f3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:23 +0000 Subject: io_uring/cmd: move io_uring_try_cancel_uring_cmd() io_uring_try_cancel_uring_cmd() is a part of the cmd handling so let's move it closer to all cmd bits into uring_cmd.c Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/43a3937af4933655f0fd9362c381802f804f43de.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 39 +-------------------------------------- io_uring/io_uring.h | 7 +++++++ io_uring/uring_cmd.c | 30 ++++++++++++++++++++++++++++++ io_uring/uring_cmd.h | 3 +++ 4 files changed, 41 insertions(+), 38 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c170a2b8d2cf..d5ac91b5b171 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "waitid.h" #include "futex.h" #include "napi.h" +#include "uring_cmd.h" #include "timeout.h" #include "poll.h" @@ -174,13 +175,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) -{ - if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) - __io_submit_flush_completions(ctx); -} - static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx) { return ctx->cached_cq_tail - READ_ONCE(ctx->rings->cq.head); @@ -3241,37 +3235,6 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) -{ - struct hlist_node *tmp; - struct io_kiocb *req; - bool ret = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, - hash_node) { - struct io_uring_cmd *cmd = io_kiocb_to_cmd(req, - struct io_uring_cmd); - struct file *file = req->file; - - if (!cancel_all && req->task != task) - continue; - - if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); - ret = true; - } - } - io_submit_flush_completions(ctx); - - return ret; -} - static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 6426ee382276..935d8d0747dc 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -154,6 +154,13 @@ static inline void io_req_task_work_add(struct io_kiocb *req) __io_req_task_work_add(req, 0); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs) || + ctx->submit_state.cqes_count) + __io_submit_flush_completions(ctx); +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 42f63adfa54a..1551848a9394 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -14,6 +14,36 @@ #include "rsrc.h" #include "uring_cmd.h" +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + bool ret = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, &ctx->cancelable_uring_cmd, + hash_node) { + struct io_uring_cmd *cmd = 
io_kiocb_to_cmd(req, + struct io_uring_cmd); + struct file *file = req->file; + + if (!cancel_all && req->task != task) + continue; + + if (cmd->flags & IORING_URING_CMD_CANCELABLE) { + /* ->sqe isn't available if no async data */ + if (!req_has_async_data(req)) + cmd->sqe = NULL; + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); + ret = true; + } + } + io_submit_flush_completions(ctx); + return ret; +} + static void io_uring_cmd_del_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 8117684ec3ca..7356bf9aa655 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -3,3 +3,6 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_uring_cmd_prep_async(struct io_kiocb *req); + +bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, + struct task_struct *task, bool cancel_all); \ No newline at end of file -- cgit v1.2.3 From 6edd953b6ec758c98e9dba7234634831f1f6510d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:24 +0000 Subject: io_uring/cmd: kill one issue_flags to tw conversion io_uring cmd converts struct io_tw_state to issue_flags and later back to io_tw_state, it's awfully ill-fated, not to mention that intermediate issue_flags state is not correct. Get rid of the last conversion, drag through tw everything that came with IO_URING_F_UNLOCKED, and replace io_req_complete_defer() with a direct call to io_req_complete_defer(), at least for the time being. Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/c53fa3df749752bd058cf6f824a90704822d6bcc.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 1551848a9394..7c1c58c5837e 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -130,11 +130,11 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); + } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { + io_req_complete_defer(req); } else { - struct io_tw_state ts = { - .locked = !(issue_flags & IO_URING_F_UNLOCKED), - }; - io_req_task_complete(req, &ts); + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -- cgit v1.2.3 From e1eef2e56cb0db143c731b1cdc220980256d2d99 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:25 +0000 Subject: io_uring/cmd: fix tw <-> issue_flags conversion !IO_URING_F_UNLOCKED does not translate to availability of the deferred completion infra, IO_URING_F_COMPLETE_DEFER does, that what we should pass and look for to use io_req_complete_defer() and other variants. Luckily, it's not a real problem as two wrongs actually made it right, at least as far as io_uring_cmd_work() goes. 
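For a uring_cmd provider, the rule this patch enforces can be sketched as follows: whatever issue_flags the core hands to the task_work callback must be forwarded to io_uring_cmd_done() untouched, never fabricated by the driver. A minimal sketch only — the my_drv_* names are hypothetical placeholders, not existing driver code; the two helpers used are the ones declared in include/linux/io_uring/cmd.h.

#include <linux/io_uring/cmd.h>

/*
 * Hypothetical driver-side pattern. The task_work callback receives its
 * issue_flags from the io_uring core (IO_URING_F_COMPLETE_DEFER when the
 * executor holds the ring locked) and passes them through to
 * io_uring_cmd_done() unchanged.
 */
static void my_drv_cmd_tw(struct io_uring_cmd *ioucmd, unsigned issue_flags)
{
	io_uring_cmd_done(ioucmd, 0, 0, issue_flags);
}

/* e.g. called from the driver's completion (IRQ) path */
static void my_drv_complete(struct io_uring_cmd *ioucmd)
{
	/* defer the CQE posting to task context instead of completing here */
	io_uring_cmd_do_in_task_lazy(ioucmd, my_drv_cmd_tw);
}
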
Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/aef76d34fe9410df8ecc42a14544fd76cd9d8b9e.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 7c1c58c5837e..759f919b14a9 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -36,7 +36,8 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, /* ->sqe isn't available if no async data */ if (!req_has_async_data(req)) cmd->sqe = NULL; - file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL); + file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | + IO_URING_F_COMPLETE_DEFER); ret = true; } } @@ -86,7 +87,11 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = IO_URING_F_UNLOCKED; + + /* locked task_work executor checks the deffered list completion */ + if (ts->locked) + issue_flags = IO_URING_F_COMPLETE_DEFER; ioucmd->task_work_cb(ioucmd, issue_flags); } @@ -130,7 +135,9 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, if (req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { + } else if (issue_flags & IO_URING_F_COMPLETE_DEFER) { + if (WARN_ON_ONCE(issue_flags & IO_URING_F_UNLOCKED)) + return; io_req_complete_defer(req); } else { req->io_task_work.func = io_req_task_complete; -- cgit v1.2.3 From 36a005b9c66eade68f4235e612d733510a8d52ff Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:26 +0000 Subject: io_uring/cmd: document some uring_cmd related helpers Add comments warning users that they're only allowed to pass issue_flags that were given from io_uring. Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Tested-by: Ming Lei Link: https://lore.kernel.org/r/82ff8a45f2c3eb5f3a04a33f0692e5e4a1320455.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index e453a997c060..01b95505538a 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -26,12 +26,25 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); + +/* + * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd + * and the corresponding io_uring request. + * + * Note: the caller should never hard code @issue_flags and is only allowed + * to pass the mask provided by the core io_uring code. + */ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); + void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned), unsigned flags); +/* + * Note: the caller should never hard code @issue_flags and only use the + * mask provided by the core io_uring code. 
+ */ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); -- cgit v1.2.3 From 1afdb76038e27a3a4dd4cf522f6457868051db84 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Mar 2024 17:10:50 -0600 Subject: nvme/io_uring: use helper for polled completions NVMe is making up issue_flags, which is a no-no in general, and to make matters worse, they are completely the wrong ones. For a pure polled request, which it does check for, we're already inside the ctx->uring_lock when the completions are run off io_do_iopoll(). Hence the correct flag would be '0' rather than IO_URING_F_UNLOCKED. Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 15 +++++++++++---- include/linux/io_uring/cmd.h | 11 +++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index 3dfd5ae99ae0..499a8bb7cac7 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -423,13 +423,20 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, pdu->result = le64_to_cpu(nvme_req(req)->result.u64); /* - * For iopoll, complete it directly. + * For iopoll, complete it directly. Note that using the uring_cmd + * helper for this is safe only because we check blk_rq_is_poll(). + * As that returns false if we're NOT on a polled queue, then it's + * safe to use the polled completion helper. + * * Otherwise, move the completion to task work. */ - if (blk_rq_is_poll(req)) - nvme_uring_task_cb(ioucmd, IO_URING_F_UNLOCKED); - else + if (blk_rq_is_poll(req)) { + if (pdu->bio) + blk_rq_unmap_user(pdu->bio); + io_uring_cmd_iopoll_done(ioucmd, pdu->result, pdu->status); + } else { io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb); + } return RQ_END_IO_FREE; } diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 01b95505538a..447fbfd32215 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -69,6 +69,17 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } #endif +/* + * Polled completions must ensure they are coming from a poll queue, and + * hence are completed inside the usual poll handling loops. + */ +static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd, + ssize_t ret, ssize_t res2) +{ + lockdep_assert(in_task()); + io_uring_cmd_done(ioucmd, ret, res2, 0); +} + /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -- cgit v1.2.3 From 6e6b8c62120a22acd8cb759304e4cd2e3215d488 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:28 +0000 Subject: io_uring/rw: avoid punting to io-wq directly kiocb_done() should care to specifically redirecting requests to io-wq. Remove the hopping to tw to then queue an io-wq, return -EAGAIN and let the core code io_uring handle offloading. 
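The io-wq offload path that kiocb_done() now defers to is also reachable directly from userspace via IOSQE_ASYNC; a rough liburing sketch, with error handling trimmed and the function name chosen only for illustration:

#include <fcntl.h>
#include <unistd.h>
#include <liburing.h>

/* Force a read to skip the inline non-blocking attempt and go straight
 * to io-wq, i.e. exercise the REQ_F_FORCE_ASYNC -> io_queue_iowq() path. */
static int read_forced_async(const char *path, void *buf, unsigned len)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd, ret = -1;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return -1;
	fd = open(path, O_RDONLY);
	if (fd < 0)
		goto out;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_sqe_set_flags(sqe, IOSQE_ASYNC);

	io_uring_submit(&ring);
	if (!io_uring_wait_cqe(&ring, &cqe)) {
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
	}
	close(fd);
out:
	io_uring_queue_exit(&ring);
	return ret;
}
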
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/413564e550fe23744a970e1783dfa566291b0e6f.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 8 ++++---- io_uring/io_uring.h | 1 - io_uring/rw.c | 8 +------- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d5ac91b5b171..e9da2ad67c2e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -492,7 +492,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) +static void io_queue_iowq(struct io_kiocb *req) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1499,7 +1499,7 @@ void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, ts); + io_queue_iowq(req); else io_queue_sqe(req); } @@ -2087,7 +2087,7 @@ static void io_queue_async(struct io_kiocb *req, int ret) break; case IO_APOLL_ABORTED: io_kbuf_recycle(req, 0); - io_queue_iowq(req, NULL); + io_queue_iowq(req); break; case IO_APOLL_OK: break; @@ -2134,7 +2134,7 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) if (unlikely(req->ctx->drain_active)) io_drain_req(req); else - io_queue_iowq(req, NULL); + io_queue_iowq(req); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 935d8d0747dc..0861d49e83de 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -79,7 +79,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index c8d48287439e..0afe4f9e0e3f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -187,12 +187,6 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -static void io_req_task_queue_reissue(struct io_kiocb *req) -{ - req->io_task_work.func = io_queue_iowq; - io_req_task_work_add(req); -} - #ifdef CONFIG_BLOCK static bool io_resubmit_prep(struct io_kiocb *req) { @@ -405,7 +399,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) - io_req_task_queue_reissue(req); + return -EAGAIN; else io_req_task_queue_fail(req, final_ret); } -- cgit v1.2.3 From 92219afb980e01d88a1ac6d8fe01dcae255c4279 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:29 +0000 Subject: io_uring: force tw ctx locking We can run normal task_work without locking the ctx, however we try to lock anyway and most handlers prefer or require it locked. It might have been interesting to multi-submitter ring with high contention completing async read/write requests via task_work, however that will still need to go through io_req_complete_post() and potentially take the lock for rsrc node putting or some other case. In other words, it's hard to care about it, so alawys force the locking. 
The case described would also because of various io_uring caches. Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/6ae858f2ef562e6ed9f13c60978c0d48926954ba.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e9da2ad67c2e..13fb5958454f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1185,8 +1185,9 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, if (req->ctx != ctx) { ctx_flush_and_put(ctx, &ts); ctx = req->ctx; - /* if not contended, grab and improve batching */ - ts.locked = mutex_trylock(&ctx->uring_lock); + + ts.locked = true; + mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, @@ -1447,11 +1448,9 @@ again: if (io_run_local_work_continue(ctx, ret, min_events)) goto again; - if (ts->locked) { - io_submit_flush_completions(ctx); - if (io_run_local_work_continue(ctx, ret, min_events)) - goto again; - } + io_submit_flush_completions(ctx); + if (io_run_local_work_continue(ctx, ret, min_events)) + goto again; trace_io_uring_local_work_run(ctx, ret, loops); return ret; @@ -1475,14 +1474,12 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = {}; + struct io_tw_state ts = { .locked = true }; int ret; - ts.locked = mutex_trylock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); ret = __io_run_local_work(ctx, &ts, min_events); - if (ts.locked) - mutex_unlock(&ctx->uring_lock); - + mutex_unlock(&ctx->uring_lock); return ret; } -- cgit v1.2.3 From 8e5b3b89ecaf6d9295e561c225b35c574a5e0fe7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:30 +0000 Subject: io_uring: remove struct io_tw_state::locked ctx is always locked for task_work now, so get rid of struct io_tw_state::locked. Note I'm stopping one step before removing io_tw_state altogether, which is not empty, because it still serves the purpose of indicating which function is a tw callback and forcing users not to invoke them carelessly out of a wrong context. The removal can always be done later. 
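After this change a task_work callback looks roughly like the sketch below: io_tw_state carries no data and only marks the function as a tw callback, while the executor already holds the lock. my_tw_cb is a hypothetical example, not an existing handler.

/*
 * Hypothetical tw callback, kernel-internal style: the io_tw_state
 * argument is a pure type-level marker; the task_work executor
 * guarantees ctx->uring_lock is held when this runs.
 */
static void my_tw_cb(struct io_kiocb *req, struct io_tw_state *ts)
{
	lockdep_assert_held(&req->ctx->uring_lock);

	io_req_set_res(req, 0, 0);
	io_req_task_complete(req, ts);
}
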
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/e95e1ea116d0bfa54b656076e6a977bc221392a4.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 -- io_uring/io_uring.c | 31 ++++++++----------------------- io_uring/io_uring.h | 5 +---- io_uring/poll.c | 2 +- io_uring/rw.c | 6 ++---- io_uring/timeout.c | 8 ++------ io_uring/uring_cmd.c | 8 ++------ io_uring/waitid.c | 2 +- 8 files changed, 17 insertions(+), 47 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ac333ea81d31..90e5af401800 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -438,8 +438,6 @@ struct io_ring_ctx { }; struct io_tw_state { - /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ - bool locked; }; enum { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 13fb5958454f..ecb59bb2418b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -247,14 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - struct io_tw_state ts = { .locked = true, }; + struct io_tw_state ts = {}; percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) req->io_task_work.func(req, &ts); - if (WARN_ON_ONCE(!ts.locked)) - return; io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1157,11 +1155,9 @@ static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ts->locked) { - io_submit_flush_completions(ctx); - mutex_unlock(&ctx->uring_lock); - ts->locked = false; - } + + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -1185,8 +1181,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, if (req->ctx != ctx) { ctx_flush_and_put(ctx, &ts); ctx = req->ctx; - - ts.locked = true; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } @@ -1459,22 +1453,16 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true, }; - int ret; + struct io_tw_state ts = {}; if (llist_empty(&ctx->work_llist)) return 0; - - ret = __io_run_local_work(ctx, &ts, min_events); - /* shouldn't happen! 
*/ - if (WARN_ON_ONCE(!ts.locked)) - mutex_lock(&ctx->uring_lock); - return ret; + return __io_run_local_work(ctx, &ts, min_events); } static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) { - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); @@ -1702,10 +1690,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (ts->locked) - io_req_complete_defer(req); - else - io_req_complete_post(req, IO_URING_F_UNLOCKED); + io_req_complete_defer(req); } /* diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 0861d49e83de..7f921daae9c3 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -351,10 +351,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!ts->locked) { - mutex_lock(&ctx->uring_lock); - ts->locked = true; - } + lockdep_assert_held(&ctx->uring_lock); } /* diff --git a/io_uring/poll.c b/io_uring/poll.c index 6db1dcb2c797..8901dd118e50 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -322,7 +322,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, ts->locked, mask, + if (!io_fill_cqe_req_aux(req, true, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; diff --git a/io_uring/rw.c b/io_uring/rw.c index 0afe4f9e0e3f..674581bda8d7 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -305,11 +305,9 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) io_req_io_end(req); - if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; + if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) + req->cqe.flags |= io_put_kbuf(req, 0); - req->cqe.flags |= io_put_kbuf(req, issue_flags); - } io_req_task_complete(req, ts); } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 7fd7dbb211d6..0a48e6acd0b2 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,10 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - bool filled; - filled = io_fill_cqe_req_aux(req, ts->locked, -ETIME, - IORING_CQE_F_MORE); - if (filled) { + if (io_fill_cqe_req_aux(req, true, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); @@ -301,7 +298,6 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -313,7 +309,7 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); + ret = io_try_cancel(req->task->io_uring, &cd, 0); } io_req_set_res(req, ret ?: -ETIME, 0); io_req_task_complete(req, ts); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 759f919b14a9..4614ce734fee 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -87,13 +87,9 @@ EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = IO_URING_F_UNLOCKED; - /* locked task_work executor checks the deffered list completion */ - if (ts->locked) - issue_flags = IO_URING_F_COMPLETE_DEFER; - - ioucmd->task_work_cb(ioucmd, issue_flags); + /* task_work executor checks the deffered list completion */ + ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 77d340666cb9..6362ec20abc0 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -118,7 +118,7 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = { .locked = true }; + struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); -- cgit v1.2.3 From e5c12945be5016d681ff305ea7306fef5902219d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:31 +0000 Subject: io_uring: refactor io_fill_cqe_req_aux The restriction on multishot execution context disallowing io-wq is driven by rules of io_fill_cqe_req_aux(), it should only be called in the master task context, either from the syscall path or in task_work. Since task_work now always takes the ctx lock implying IO_URING_F_COMPLETE_DEFER, we can just assume that the function is always called with its defer argument set to true. Kill the argument. Also rename the function for more consistency as "fill" in CQE related functions was usually meant for raw interfaces only copying data into the CQ without any locking, waking the user and other accounting "post" functions take care of. 
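From userspace, the stream of CQEs that io_req_post_cqe() produces for a multishot request shows up as completions flagged with IORING_CQE_F_MORE. A hedged liburing sketch using multishot accept — handle_client is a hypothetical application callback and error handling is trimmed:

#include <liburing.h>

void handle_client(int fd);	/* hypothetical application callback */

/*
 * One multishot accept submission produces many CQEs; each extra CQE is
 * posted through the aux/post helper and carries IORING_CQE_F_MORE until
 * the kernel terminates the multishot.
 */
static void accept_loop(struct io_uring *ring, int listen_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
	struct io_uring_cqe *cqe;

	io_uring_prep_multishot_accept(sqe, listen_fd, NULL, NULL, 0);
	io_uring_submit(ring);

	for (;;) {
		if (io_uring_wait_cqe(ring, &cqe))
			break;
		if (cqe->res >= 0)
			handle_client(cqe->res);
		if (!(cqe->flags & IORING_CQE_F_MORE)) {
			/* terminal CQE: multishot stopped, re-arm if needed */
			io_uring_cqe_seen(ring, cqe);
			break;
		}
		io_uring_cqe_seen(ring, cqe);
	}
}
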
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/93423d106c33116c7d06bf277f651aa68b427328.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 16 +++------------- io_uring/io_uring.h | 2 +- io_uring/net.c | 6 ++---- io_uring/poll.c | 3 +-- io_uring/rw.c | 4 +--- io_uring/timeout.c | 2 +- 6 files changed, 9 insertions(+), 24 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ecb59bb2418b..9ae5e7db7c6a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -907,40 +907,30 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) state->cqes_count = 0; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled && allow_overflow) + if (!filled) filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); io_cq_unlock_post(ctx); return filled; } -bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) -{ - return __io_post_aux_cqe(ctx, user_data, res, cflags, true); -} - /* * A helper for multishot requests posting additional CQEs. * Should only be used from a task_work including IO_URING_F_MULTISHOT. */ -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags) +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; lockdep_assert(!io_wq_current_is_worker()); - - if (!defer) - return __io_post_aux_cqe(ctx, user_data, res, cflags, false); - lockdep_assert_held(&ctx->uring_lock); if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7f921daae9c3..460290e1bdec 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -67,7 +67,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_fill_cqe_req_aux(struct io_kiocb *req, bool defer, s32 res, u32 cflags); +bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); diff --git a/io_uring/net.c b/io_uring/net.c index 4afb475d4197..48bf5ca1a671 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -706,8 +706,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * receive from this socket. 
*/ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && - io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - *ret, cflags | IORING_CQE_F_MORE)) { + io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; @@ -1429,8 +1428,7 @@ retry: if (ret < 0) return ret; - if (io_fill_cqe_req_aux(req, issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, IORING_CQE_F_MORE)) + if (io_req_post_cqe(req, ret, IORING_CQE_F_MORE)) goto retry; io_req_set_res(req, ret, 0); diff --git a/io_uring/poll.c b/io_uring/poll.c index 8901dd118e50..5d55bbf1de15 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -322,8 +322,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_fill_cqe_req_aux(req, true, mask, - IORING_CQE_F_MORE)) { + if (!io_req_post_cqe(req, mask, IORING_CQE_F_MORE)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } diff --git a/io_uring/rw.c b/io_uring/rw.c index 674581bda8d7..b5165d1c9686 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -962,9 +962,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) cflags = io_put_kbuf(req, issue_flags); rw->len = 0; /* similarly to above, reset len to 0 */ - if (io_fill_cqe_req_aux(req, - issue_flags & IO_URING_F_COMPLETE_DEFER, - ret, cflags | IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (issue_flags & IO_URING_F_MULTISHOT) { /* * Force retry, as we might have more data to diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 0a48e6acd0b2..3458ca550b83 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -72,7 +72,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; if (!io_timeout_finish(timeout, data)) { - if (io_fill_cqe_req_aux(req, true, -ETIME, IORING_CQE_F_MORE)) { + if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); -- cgit v1.2.3 From 902ce82c2aa130bea5e3feca2d4ae62781865da7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:32 +0000 Subject: io_uring: get rid of intermediate aux cqe caches io_post_aux_cqe(), which is used for multishot requests, delays completions by putting CQEs into a temporary array for the purpose completion lock/flush batching. DEFER_TASKRUN doesn't need any locking, so for it we can put completions directly into the CQ and defer post completion handling with a flag. That leaves !DEFER_TASKRUN, which is not that interesting / hot for multishot requests, so have conditional locking with deferred flush for them. 
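The lockless path described here corresponds to rings created with IORING_SETUP_DEFER_TASKRUN, which in turn requires IORING_SETUP_SINGLE_ISSUER. A minimal liburing sketch of such a setup, with the function name chosen only for illustration:

#include <liburing.h>

/*
 * A ring whose completions are only ever generated from the submitter
 * task: the DEFER_TASKRUN case above, where aux CQEs can be written into
 * the CQ without taking the completion lock.
 */
static int setup_defer_taskrun_ring(struct io_uring *ring)
{
	struct io_uring_params p = { };

	p.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN;
	return io_uring_queue_init_params(64, ring, &p);
}
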
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/b1d05a81fd27aaa2a07f9860af13059e7ad7a890.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +- io_uring/io_uring.c | 62 +++++++++--------------------------------- io_uring/io_uring.h | 2 +- 3 files changed, 15 insertions(+), 52 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 90e5af401800..75b46119d4c8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -205,6 +205,7 @@ struct io_submit_state { bool plug_started; bool need_plug; + bool cq_flush; unsigned short submit_nr; unsigned int cqes_count; struct blk_plug plug; @@ -341,8 +342,6 @@ struct io_ring_ctx { unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; - struct io_uring_cqe completion_cqes[16]; - spinlock_t completion_lock; /* IRQ completion list, under ->completion_lock */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9ae5e7db7c6a..77e26b037fdd 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -630,6 +630,12 @@ static inline void __io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } +static inline void __io_cq_unlock(struct io_ring_ctx *ctx) +{ + if (!ctx->lockless_cq) + spin_unlock(&ctx->completion_lock); +} + static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { @@ -882,31 +888,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } -static void __io_flush_post_cqes(struct io_ring_ctx *ctx) - __must_hold(&ctx->uring_lock) -{ - struct io_submit_state *state = &ctx->submit_state; - unsigned int i; - - lockdep_assert_held(&ctx->uring_lock); - for (i = 0; i < state->cqes_count; i++) { - struct io_uring_cqe *cqe = &ctx->completion_cqes[i]; - - if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - spin_unlock(&ctx->completion_lock); - } else { - io_cqring_event_overflow(ctx, cqe->user_data, - cqe->res, cqe->flags, 0, 0); - } - } - } - state->cqes_count = 0; -} - bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; @@ -927,31 +908,16 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) { struct io_ring_ctx *ctx = req->ctx; - u64 user_data = req->cqe.user_data; - struct io_uring_cqe *cqe; + bool posted; lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->completion_cqes)) { - __io_cq_lock(ctx); - __io_flush_post_cqes(ctx); - /* no need to flush - flush is deferred */ - __io_cq_unlock_post(ctx); - } - - /* For defered completions this is not as strict as it is otherwise, - * however it's main job is to prevent unbounded posted completions, - * and in that it works just as well. 
- */ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - return false; - - cqe = &ctx->completion_cqes[ctx->submit_state.cqes_count++]; - cqe->user_data = user_data; - cqe->res = res; - cqe->flags = cflags; - return true; + __io_cq_lock(ctx); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + ctx->submit_state.cq_flush = true; + __io_cq_unlock_post(ctx); + return posted; } static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) @@ -1545,9 +1511,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node; __io_cq_lock(ctx); - /* must come first to preserve CQE ordering in failure cases */ - if (state->cqes_count) - __io_flush_post_cqes(ctx); __wq_list_for_each(node, &state->compl_reqs) { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1569,6 +1532,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + ctx->submit_state.cq_flush = false; } static unsigned io_cqring_events(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 460290e1bdec..5119265a11c2 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -156,7 +156,7 @@ static inline void io_req_task_work_add(struct io_kiocb *req) static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || - ctx->submit_state.cqes_count) + ctx->submit_state.cq_flush) __io_submit_flush_completions(ctx); } -- cgit v1.2.3 From 23fbdde6205d9351bb52a4b8f11ec38bdbc8561a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:33 +0000 Subject: io_uring: remove current check from complete_post task_work execution is now always locked, and we shouldn't get into io_req_complete_post() from them. That means that complete_post() is always called out of the original task context and we don't even need to check current. Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/24ec27f27db0d8f58c974d8118dca1d345314ddc.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 77e26b037fdd..a0073625ff16 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -972,7 +972,7 @@ void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - if (ctx->task_complete && ctx->submitter_task != current) { + if (ctx->task_complete) { req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } else if (!(issue_flags & IO_URING_F_UNLOCKED) || -- cgit v1.2.3 From 0667db14e1f029d56243aa2509ebc5f944388200 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:34 +0000 Subject: io_uring: refactor io_req_complete_post() Make io_req_complete_post() to push all IORING_SETUP_IOPOLL requests to task_work, it's much cleaner and should normally happen. We couldn't do it before because there was a possibility of looping in complete_post() -> tw -> complete_post() -> ... Also, unexport the function and inline __io_req_complete_post(). 
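For context, the IOPOLL rings being special-cased here are driven from userspace roughly as in the sketch below; it assumes a file or device that supports polled O_DIRECT I/O, and the function name and buffer sizing are illustrative only:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <liburing.h>

/* A polled read: completions are reaped by polling under uring_lock
 * rather than via interrupts, which is why the kernel side can route
 * such completions through task_work / deferred completion. */
static int polled_read(const char *path)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	void *buf;
	int fd, ret = -1;

	if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL) < 0)
		return -1;
	fd = open(path, O_RDONLY | O_DIRECT);
	if (fd < 0 || posix_memalign(&buf, 4096, 4096))
		goto out;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, 4096, 0);
	io_uring_submit(&ring);

	if (!io_uring_wait_cqe(&ring, &cqe)) {	/* polls for IOPOLL rings */
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
	}
	free(buf);
out:
	if (fd >= 0)
		close(fd);
	io_uring_queue_exit(&ring);
	return ret;
}
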
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/ea19c032ace3e0dd96ac4d991a063b0188037014.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 29 +++++++++++------------------ io_uring/io_uring.h | 1 - 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a0073625ff16..951ff3b787ab 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -920,11 +920,21 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) return posted; } -static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) +static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_rsrc_node *rsrc_node = NULL; + /* + * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires + * the submitter task context, IOPOLL protects with uring_lock. + */ + if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return; + } + io_cq_lock(ctx); if (!(req->flags & REQ_F_CQE_SKIP)) { if (!io_fill_cqe_req(ctx, req)) @@ -968,23 +978,6 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (ctx->task_complete) { - req->io_task_work.func = io_req_task_complete; - io_req_task_work_add(req); - } else if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(ctx->flags & IORING_SETUP_IOPOLL)) { - __io_req_complete_post(req, issue_flags); - } else { - mutex_lock(&ctx->uring_lock); - __io_req_complete_post(req, issue_flags & ~IO_URING_F_UNLOCKED); - mutex_unlock(&ctx->uring_lock); - } -} - void io_req_defer_failed(struct io_kiocb *req, s32 res) __must_hold(&ctx->uring_lock) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 5119265a11c2..f694e7e6fb25 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -65,7 +65,6 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); -void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -- cgit v1.2.3 From c133b3b06b0653036b0c07675c1db0c89467ccdb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 18 Mar 2024 22:00:35 +0000 Subject: io_uring: clean up io_lockdep_assert_cq_locked Move CONFIG_PROVE_LOCKING checks inside of io_lockdep_assert_cq_locked() and kill the else branch. 
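The shape of the cleanup, shown on a hypothetical helper (struct my_ctx stands in for the real context type): a single always-present inline whose body compiles away, instead of a duplicate empty stub behind #else.

/*
 * Same pattern on a hypothetical helper: the lockdep-only body is
 * compiled out without CONFIG_PROVE_LOCKING, so no second definition
 * is needed.
 */
static inline void my_assert_cq_locked(struct my_ctx *ctx)
{
#if defined(CONFIG_PROVE_LOCKING)
	lockdep_assert_held(&ctx->completion_lock);
#endif
}
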
Signed-off-by: Pavel Begunkov Tested-by: Ming Lei Link: https://lore.kernel.org/r/bbf33c429c9f6d7207a8fe66d1a5866ec2c99850.1710799188.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index f694e7e6fb25..bae8c1e937c1 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -119,9 +119,9 @@ enum { void io_eventfd_ops(struct rcu_head *rcu); void io_activate_pollwq(struct io_ring_ctx *ctx); -#if defined(CONFIG_PROVE_LOCKING) static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) { +#if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); if (ctx->flags & IORING_SETUP_IOPOLL) { @@ -140,12 +140,8 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) else lockdep_assert(current == ctx->submitter_task); } -} -#else -static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) -{ -} #endif +} static inline void io_req_task_work_add(struct io_kiocb *req) { -- cgit v1.2.3 From 254176234222c97c5da7fd33ff8c61d06480c228 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 10:41:25 -0600 Subject: io_uring: flush delayed fallback task_work in cancelation Just like we run the inline task_work, ensure we also factor in and run the fallback task_work. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 951ff3b787ab..805adebe19be 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3218,6 +3218,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, ret |= io_kill_timeouts(ctx, task, cancel_all); if (task) ret |= io_run_task_work() > 0; + else + ret |= flush_delayed_work(&ctx->fallback_work); return ret; } -- cgit v1.2.3 From 29f858a7c6e06060bbadee6d8502df36eed888bf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 11:10:21 -0600 Subject: io_uring: remove timeout/poll specific cancelations For historical reasons these were special cased, as they were the only ones that needed cancelation. But now we handle cancelations generally, and hence there's no need to check for these in io_ring_ctx_wait_and_kill() when io_uring_try_cancel_requests() handles both these and the rest as well. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 805adebe19be..e6e7794d497f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3068,17 +3068,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); - if (ctx->rings) - io_poll_remove_all(ctx, NULL, true); mutex_unlock(&ctx->uring_lock); - /* - * If we failed setting up the ctx, we might not have any rings - * and therefore did not submit any requests - */ - if (ctx->rings) - io_kill_timeouts(ctx, NULL, true); - flush_delayed_work(&ctx->fallback_work); INIT_WORK(&ctx->exit_work, io_ring_exit_work); -- cgit v1.2.3 From 0ae9b9a14d54bd0aa68c1e8bda9dd8e6346f1d87 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 18:23:44 -0600 Subject: io_uring/alloc_cache: shrink default max entries from 512 to 128 In practice, we just need to recycle a few elements for (by far) most use cases. Shrink the total size down from 512 to 128, which should be more than plenty. 
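As a userspace model of what IO_ALLOC_CACHE_MAX bounds, a recycling free list that falls back to the allocator once the cap is reached might look like the sketch below; all names are illustrative and this is not the kernel implementation:

#include <stdlib.h>

#define CACHE_MAX	128	/* the role IO_ALLOC_CACHE_MAX plays */

struct cache_entry {
	struct cache_entry *next;
};

struct alloc_cache {
	struct cache_entry *head;
	unsigned int nr;
};

/* Reuse a cached object if one is available, else fall back to malloc(). */
static void *cache_get(struct alloc_cache *c, size_t size)
{
	if (c->head) {
		struct cache_entry *e = c->head;

		c->head = e->next;
		c->nr--;
		return e;
	}
	return malloc(size);
}

/* Recycle an object, but never let the cache grow past the cap: beyond
 * that, freeing keeps worst-case memory bounded. */
static void cache_put(struct alloc_cache *c, void *obj)
{
	struct cache_entry *e = obj;

	if (c->nr >= CACHE_MAX) {
		free(obj);
		return;
	}
	e->next = c->head;
	c->head = e;
	c->nr++;
}
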
Signed-off-by: Jens Axboe --- io_uring/alloc_cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index bf2fb26a6539..138ad14b0b12 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -4,7 +4,7 @@ /* * Don't allow the cache to grow beyond this size. */ -#define IO_ALLOC_CACHE_MAX 512 +#define IO_ALLOC_CACHE_MAX 128 struct io_cache_entry { struct io_wq_work_node node; -- cgit v1.2.3 From 54cdcca05abde32acc3233950ddc79d8be25515f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Mar 2024 09:34:21 -0700 Subject: io_uring/net: switch io_send() and io_send_zc() to using io_async_msghdr No functional changes in this patch, just in preparation for carrying more state then what is being done now, if necessary. While unifying some of this code, add a generic send setup prep handler that they can both use. This gets rid of some manual msghdr and sockaddr on the stack, and makes it look a bit more like the sendmsg/recvmsg variants. Going forward, more can get unified on top. Signed-off-by: Jens Axboe --- io_uring/net.c | 194 ++++++++++++++++++++++++++++--------------------------- io_uring/opdef.c | 1 + 2 files changed, 101 insertions(+), 94 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 48bf5ca1a671..240a77976fda 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -322,36 +322,25 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, int io_send_prep_async(struct io_kiocb *req) { - struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *io; int ret; if (req_has_async_data(req)) return 0; - zc->done_io = 0; - if (!zc->addr) + sr->done_io = 0; + if (!sr->addr) return 0; io = io_msg_alloc_async_prep(req); if (!io) return -ENOMEM; - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &io->addr); - return ret; -} - -static int io_setup_async_addr(struct io_kiocb *req, - struct sockaddr_storage *addr_storage, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; - - if (!sr->addr || req_has_async_data(req)) - return -EAGAIN; - io = io_msg_alloc_async(req, issue_flags); - if (!io) - return -ENOMEM; - memcpy(&io->addr, addr_storage, sizeof(io->addr)); - return -EAGAIN; + memset(&io->msg, 0, sizeof(io->msg)); + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &io->msg.msg_iter); + if (unlikely(ret)) + return ret; + io->msg.msg_name = &io->addr; + io->msg.msg_namelen = sr->addr_len; + return move_addr_to_kernel(sr->addr, sr->addr_len, &io->addr); } int io_sendmsg_prep_async(struct io_kiocb *req) @@ -475,45 +464,66 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_send(struct io_kiocb *req, unsigned int issue_flags) +static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, + struct io_async_msghdr *stack_msg, + unsigned int issue_flags) { - struct sockaddr_storage __address; struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; - struct socket *sock; - unsigned flags; - int min_ret = 0; + struct io_async_msghdr *kmsg; int ret; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - msg.msg_ubuf = NULL; - - if (sr->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &__address); + if 
(req_has_async_data(req)) { + kmsg = req->async_data; + } else { + kmsg = stack_msg; + kmsg->free_iov = NULL; + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_ubuf = NULL; + + if (sr->addr) { + ret = move_addr_to_kernel(sr->addr, sr->addr_len, + &kmsg->addr); if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; + return ERR_PTR(ret); + kmsg->msg.msg_name = &kmsg->addr; + kmsg->msg.msg_namelen = sr->addr_len; } - msg.msg_namelen = sr->addr_len; + + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ERR_PTR(ret); } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + return ERR_PTR(io_setup_async_msg(req, kmsg, issue_flags)); + + return kmsg; +} + +int io_send(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr iomsg, *kmsg; + size_t len = sr->len; + struct socket *sock; + unsigned flags; + int min_ret = 0; + int ret; sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &msg.msg_iter); + kmsg = io_send_setup(req, &iomsg, issue_flags); + if (IS_ERR(kmsg)) + return PTR_ERR(kmsg); + + ret = import_ubuf(ITER_SOURCE, sr->buf, len, &kmsg->msg.msg_iter); if (unlikely(ret)) return ret; @@ -521,21 +531,21 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = flags; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = flags; + ret = sock_sendmsg(sock, &kmsg->msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return io_setup_async_msg(req, kmsg, issue_flags); if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -545,6 +555,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + io_req_msg_cleanup(req, kmsg, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } @@ -1158,11 +1169,35 @@ static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb, return ret; } +static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, + (u64)(uintptr_t)sr->buf, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter; + } else { + io_notif_set_extended(sr->notif); + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + ret = io_notif_account_mem(sr->notif, sr->len); + if (unlikely(ret)) + return ret; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + } + + return ret; +} + int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { - struct 
sockaddr_storage __address; struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1173,67 +1208,37 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - msg.msg_name = NULL; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_namelen = 0; - - if (zc->addr) { - if (req_has_async_data(req)) { - struct io_async_msghdr *io = req->async_data; - - msg.msg_name = &io->addr; - } else { - ret = move_addr_to_kernel(zc->addr, zc->addr_len, &__address); - if (unlikely(ret < 0)) - return ret; - msg.msg_name = (struct sockaddr *)&__address; - } - msg.msg_namelen = zc->addr_len; - } - - if (!(req->flags & REQ_F_POLLED) && - (zc->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_addr(req, &__address, issue_flags); + kmsg = io_send_setup(req, &iomsg, issue_flags); + if (IS_ERR(kmsg)) + return PTR_ERR(kmsg); - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu, - (u64)(uintptr_t)zc->buf, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter; - } else { - io_notif_set_extended(zc->notif); - ret = import_ubuf(ITER_SOURCE, zc->buf, zc->len, &msg.msg_iter); + if (!zc->done_io) { + ret = io_send_zc_import(req, kmsg); if (unlikely(ret)) return ret; - ret = io_notif_account_mem(zc->notif, zc->len); - if (unlikely(ret)) - return ret; - msg.sg_from_iter = io_sg_from_iter_iovec; } msg_flags = zc->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; - msg.msg_flags = msg_flags; - msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; - ret = sock_sendmsg(sock, &msg); + kmsg->msg.msg_flags = msg_flags; + kmsg->msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; + ret = sock_sendmsg(sock, &kmsg->msg); if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_addr(req, &__address, issue_flags); + return io_setup_async_msg(req, kmsg, issue_flags); - if (ret > 0 && io_net_retry(sock, msg.msg_flags)) { + if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_addr(req, &__address, issue_flags); + return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1251,6 +1256,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); + io_netmsg_recycle(req, issue_flags); req->flags &= ~REQ_F_NEED_CLEANUP; } io_req_set_res(req, ret, IORING_CQE_F_MORE); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 9c080aadc5a6..b0a990c6bbff 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -602,6 +602,7 @@ const struct io_cold_def io_cold_defs[] = { .name = "SEND", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, .prep_async = io_send_prep_async, #endif -- cgit v1.2.3 From 4a3223f7bfda14c532856152b12aace525cf8079 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Mar 2024 15:39:16 -0700 Subject: io_uring/net: switch io_recv() to using io_async_msghdr No 
functional changes in this patch, just in preparation for carrying more state than what is available now, if necessary. Signed-off-by: Jens Axboe --- io_uring/net.c | 75 +++++++++++++++++++++++++++++++++++--------------------- io_uring/net.h | 2 +- io_uring/opdef.c | 7 ++++-- 3 files changed, 53 insertions(+), 31 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 240a77976fda..2be4919f99c6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -320,7 +320,7 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, return ret; } -int io_send_prep_async(struct io_kiocb *req) +int io_sendrecv_prep_async(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *io; @@ -703,13 +703,13 @@ static inline void io_recv_prep_retry(struct io_kiocb *req) * again (for multishot). */ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, - struct msghdr *msg, bool mshot_finished, - unsigned issue_flags) + struct io_async_msghdr *kmsg, + bool mshot_finished, unsigned issue_flags) { unsigned int cflags; cflags = io_put_kbuf(req, issue_flags); - if (msg->msg_inq > 0) + if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; /* @@ -723,7 +723,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ - if (cflags & IORING_CQE_F_SOCK_NONEMPTY || msg->msg_inq < 0) { + if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) return false; /* mshot retries exceeded, force a requeue */ @@ -924,7 +924,7 @@ retry_multishot: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &kmsg->msg, mshot_finished, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; if (mshot_finished) @@ -938,29 +938,42 @@ retry_multishot: int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct msghdr msg; + struct io_async_msghdr iomsg, *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; size_t len = sr->len; + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + kmsg = &iomsg; + kmsg->free_iov = NULL; + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_iocb = NULL; + kmsg->msg.msg_ubuf = NULL; + + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + } + if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return -EAGAIN; + return io_setup_async_msg(req, kmsg, issue_flags); sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_get_inq = 1; - msg.msg_controllen = 0; - msg.msg_iocb = NULL; - msg.msg_ubuf = NULL; - flags = sr->msg_flags; if (force_nonblock) flags |= MSG_DONTWAIT; @@ -974,22 +987,23 @@ retry_multishot: return -ENOBUFS; sr->buf = buf; sr->len = len; + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + goto out_free; } - ret = import_ubuf(ITER_DEST, sr->buf, len, &msg.msg_iter); - if (unlikely(ret)) - goto out_free; - - msg.msg_inq = -1; - msg.msg_flags = 0; + kmsg->msg.msg_inq = -1; + 
kmsg->msg.msg_flags = 0; if (flags & MSG_WAITALL) - min_ret = iov_iter_count(&msg.msg_iter); + min_ret = iov_iter_count(&kmsg->msg.msg_iter); - ret = sock_recvmsg(sock, &msg, flags); + ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + ret = io_setup_async_msg(req, kmsg, issue_flags); + if (ret == -EAGAIN && issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } @@ -1001,12 +1015,12 @@ retry_multishot: sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_setup_async_msg(req, kmsg, issue_flags); } if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); - } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { out_free: req_set_fail(req); } @@ -1018,9 +1032,14 @@ out_free: else io_kbuf_recycle(req, issue_flags); - if (!io_recv_finish(req, &ret, &msg, ret <= 0, issue_flags)) + if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) goto retry_multishot; + if (ret == -EAGAIN) + return io_setup_async_msg(req, kmsg, issue_flags); + else if (ret != IOU_OK && ret != IOU_STOP_MULTISHOT) + io_req_msg_cleanup(req, kmsg, issue_flags); + return ret; } diff --git a/io_uring/net.h b/io_uring/net.h index 191009979bcb..5c1230f1aaf9 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -40,7 +40,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); -int io_send_prep_async(struct io_kiocb *req); +int io_sendrecv_prep_async(struct io_kiocb *req); int io_recvmsg_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index b0a990c6bbff..77131826d603 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -604,13 +604,16 @@ const struct io_cold_def io_cold_defs[] = { .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_send_prep_async, + .prep_async = io_sendrecv_prep_async, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, + .prep_async = io_sendrecv_prep_async, #endif }, [IORING_OP_OPENAT2] = { @@ -687,7 +690,7 @@ const struct io_cold_def io_cold_defs[] = { .name = "SEND_ZC", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_send_prep_async, + .prep_async = io_sendrecv_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif -- cgit v1.2.3 From f5b00ab2221a26202da7d10542a98203075bfdf8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 09:38:08 -0600 Subject: io_uring/net: unify cleanup handling Now that recv/recvmsg both do the same cleanup, put it in the retry and finish handlers. 
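The cleanup unification described above is a general pattern. Below is a minimal userspace C sketch (not kernel code; req_state, finish_request and handle_recv are invented names for illustration) of routing every completion path through one finish helper so per-request scratch state is freed in exactly one place rather than on each early return:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct req_state {
	char *scratch;		/* per-request buffer, analogous to the async msghdr */
	int done;
};

/*
 * Single cleanup point: every completion path funnels through here,
 * instead of each handler freeing the scratch buffer on every return.
 */
static int finish_request(struct req_state *st, int ret)
{
	free(st->scratch);
	st->scratch = NULL;
	st->done = 1;
	return ret;
}

static int handle_recv(struct req_state *st, const char *data, size_t len)
{
	st->scratch = malloc(len + 1);
	if (!st->scratch)
		return -1;
	memcpy(st->scratch, data, len);
	st->scratch[len] = '\0';
	printf("received: %s\n", st->scratch);
	/* cleanup handled in one place, for both oneshot and retry paths */
	return finish_request(st, (int)len);
}

int main(void)
{
	struct req_state st = { 0 };

	return handle_recv(&st, "hello", 5) < 0;
}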
Signed-off-by: Jens Axboe --- io_uring/net.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 2be4919f99c6..b4f78ede070b 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -686,10 +686,16 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static inline void io_recv_prep_retry(struct io_kiocb *req) +static inline void io_recv_prep_retry(struct io_kiocb *req, + struct io_async_msghdr *kmsg) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + if (kmsg->free_iov) { + kfree(kmsg->free_iov); + kmsg->free_iov = NULL; + } + req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; sr->len = 0; /* get from the provided buffer */ @@ -721,7 +727,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - io_recv_prep_retry(req); + io_recv_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) @@ -730,10 +736,9 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, sr->nr_multishot_loops = 0; mshot_retry_ret = IOU_REQUEUE; } - if (issue_flags & IO_URING_F_MULTISHOT) + *ret = io_setup_async_msg(req, kmsg, issue_flags); + if (*ret == -EAGAIN && issue_flags & IO_URING_F_MULTISHOT) *ret = mshot_retry_ret; - else - *ret = -EAGAIN; return true; } @@ -744,6 +749,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; + io_req_msg_cleanup(req, kmsg, issue_flags); return true; } @@ -927,11 +933,6 @@ retry_multishot: if (!io_recv_finish(req, &ret, kmsg, mshot_finished, issue_flags)) goto retry_multishot; - if (mshot_finished) - io_req_msg_cleanup(req, kmsg, issue_flags); - else if (ret == -EAGAIN) - return io_setup_async_msg(req, kmsg, issue_flags); - return ret; } @@ -1035,11 +1036,6 @@ out_free: if (!io_recv_finish(req, &ret, kmsg, ret <= 0, issue_flags)) goto retry_multishot; - if (ret == -EAGAIN) - return io_setup_async_msg(req, kmsg, issue_flags); - else if (ret != IOU_OK && ret != IOU_STOP_MULTISHOT) - io_req_msg_cleanup(req, kmsg, issue_flags); - return ret; } -- cgit v1.2.3 From 790b68b32a678b65b161861f83b2b782b6b9246b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 17:26:09 -0600 Subject: io_uring/net: always setup an io_async_msghdr Rather than use an on-stack one and then need to allocate and copy if async execution is required, always grab one upfront. This should be very cheap, and potentially even have cache hotness benefits for back-to-back send/recv requests. For any recv type of request, this is probably a good choice in general, as it's expected that no data is available initially. For send this is not necessarily the case, as space in the socket buffer is expected to be available. However, getting a cached io_async_msghdr is very cheap, and as it should be cache hot, probably the difference here is neglible, if any. A nice side benefit is that io_setup_async_msg can get killed completely, which has some nasty iovec manipulation code. 
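As a rough model of the approach described above, here is a self-contained userspace C sketch of a small free-list cache that makes unconditional upfront allocation cheap. The names (msg_state, state_cache, cache_get, cache_put) are invented for the example; this is not the io_uring alloc cache itself:

#include <stdlib.h>
#include <string.h>

struct msg_state {
	struct msg_state *next;	/* free-list link, only meaningful while cached */
	char ctrl[64];		/* stand-in for the msghdr/iovec state */
};

struct state_cache {
	struct msg_state *free_list;
	unsigned int nr_cached, max_cached;
};

static struct msg_state *cache_get(struct state_cache *c)
{
	struct msg_state *st = c->free_list;

	if (st) {			/* cache hit: reuse a (likely cache hot) object */
		c->free_list = st->next;
		c->nr_cached--;
	} else {			/* cache miss: fall back to the allocator */
		st = malloc(sizeof(*st));
	}
	if (st)
		memset(st, 0, sizeof(*st));
	return st;
}

static void cache_put(struct state_cache *c, struct msg_state *st)
{
	if (c->nr_cached >= c->max_cached) {	/* cache full, really free it */
		free(st);
		return;
	}
	st->next = c->free_list;
	c->free_list = st;
	c->nr_cached++;
}

int main(void)
{
	struct state_cache cache = { .max_cached = 32 };
	/* grabbed unconditionally, instead of stack-then-copy on retry */
	struct msg_state *st = cache_get(&cache);

	if (!st)
		return 1;
	cache_put(&cache, st);		/* recycled at completion time */
	free(cache.free_list);		/* drain the single cached entry on teardown */
	return 0;
}

The trade-off is the one stated in the message: recv-type requests almost always need the state anyway, and for send the extra get/put against a hot cache is cheap enough that dropping the on-stack copy path is a net simplification.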
Signed-off-by: Jens Axboe --- io_uring/net.c | 117 +++++++++++++++++++++++---------------------------------- 1 file changed, 47 insertions(+), 70 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b4f78ede070b..d095d3e48fcd 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -161,36 +161,6 @@ static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *r return io_msg_alloc_async(req, 0); } -static int io_setup_async_msg(struct io_kiocb *req, - struct io_async_msghdr *kmsg, - unsigned int issue_flags) -{ - struct io_async_msghdr *async_msg; - - if (req_has_async_data(req)) - return -EAGAIN; - async_msg = io_msg_alloc_async(req, issue_flags); - if (!async_msg) { - kfree(kmsg->free_iov); - return -ENOMEM; - } - req->flags |= REQ_F_NEED_CLEANUP; - memcpy(async_msg, kmsg, sizeof(*kmsg)); - if (async_msg->msg.msg_name) - async_msg->msg.msg_name = &async_msg->addr; - - if ((req->flags & REQ_F_BUFFER_SELECT) && !async_msg->msg.msg_iter.nr_segs) - return -EAGAIN; - - /* if were using fast_iov, set it to the new one */ - if (iter_is_iovec(&kmsg->msg.msg_iter) && !kmsg->free_iov) { - size_t fast_idx = iter_iov(&kmsg->msg.msg_iter) - kmsg->fast_iov; - async_msg->msg.msg_iter.__iov = &async_msg->fast_iov[fast_idx]; - } - - return -EAGAIN; -} - #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -409,7 +379,7 @@ static void io_req_msg_cleanup(struct io_kiocb *req, int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; struct socket *sock; unsigned flags; int min_ret = 0; @@ -423,15 +393,17 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg = req->async_data; kmsg->msg.msg_control_user = sr->msg_control; } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); + kmsg = io_msg_alloc_async(req, issue_flags); + if (unlikely(!kmsg)) + return -ENOMEM; + ret = io_sendmsg_copy_hdr(req, kmsg); if (ret) return ret; - kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -443,13 +415,13 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -465,7 +437,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) } static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, - struct io_async_msghdr *stack_msg, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -475,8 +446,9 @@ static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, if (req_has_async_data(req)) { kmsg = req->async_data; } else { - kmsg = stack_msg; - kmsg->free_iov = NULL; + kmsg = io_msg_alloc_async(req, issue_flags); + if (unlikely(!kmsg)) + return ERR_PTR(-ENOMEM); kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; kmsg->msg.msg_control = NULL; @@ -500,7 +472,7 @@ static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, 
if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return ERR_PTR(io_setup_async_msg(req, kmsg, issue_flags)); + return ERR_PTR(-EAGAIN); return kmsg; } @@ -508,7 +480,7 @@ static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; size_t len = sr->len; struct socket *sock; unsigned flags; @@ -519,7 +491,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = io_send_setup(req, &iomsg, issue_flags); + kmsg = io_send_setup(req, issue_flags); if (IS_ERR(kmsg)) return PTR_ERR(kmsg); @@ -538,14 +510,14 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret = sock_sendmsg(sock, &kmsg->msg); if (ret < min_ret) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -736,9 +708,10 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, sr->nr_multishot_loops = 0; mshot_retry_ret = IOU_REQUEUE; } - *ret = io_setup_async_msg(req, kmsg, issue_flags); - if (*ret == -EAGAIN && issue_flags & IO_URING_F_MULTISHOT) + if (issue_flags & IO_URING_F_MULTISHOT) *ret = mshot_retry_ret; + else + *ret = -EAGAIN; return true; } @@ -840,7 +813,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -854,15 +827,17 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; } else { - ret = io_recvmsg_copy_hdr(req, &iomsg); + kmsg = io_msg_alloc_async(req, issue_flags); + if (unlikely(!kmsg)) + return -ENOMEM; + ret = io_recvmsg_copy_hdr(req, kmsg); if (ret) return ret; - kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags; if (force_nonblock) @@ -904,17 +879,16 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg, issue_flags); - if (ret == -EAGAIN && (issue_flags & IO_URING_F_MULTISHOT)) { + if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } - return ret; + return -EAGAIN; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -939,7 +913,7 @@ retry_multishot: int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -949,7 +923,9 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = 
req->async_data; } else { - kmsg = &iomsg; + kmsg = io_msg_alloc_async(req, issue_flags); + if (unlikely(!kmsg)) + return -ENOMEM; kmsg->free_iov = NULL; kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; @@ -969,7 +945,7 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; sock = sock_from_file(req->file); if (unlikely(!sock)) @@ -1003,8 +979,7 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - ret = io_setup_async_msg(req, kmsg, issue_flags); - if (ret == -EAGAIN && issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) { io_kbuf_recycle(req, issue_flags); return IOU_ISSUE_SKIP_COMPLETE; } @@ -1016,7 +991,7 @@ retry_multishot: sr->buf += ret; sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1212,7 +1187,7 @@ static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1223,7 +1198,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - kmsg = io_send_setup(req, &iomsg, issue_flags); + kmsg = io_send_setup(req, issue_flags); if (IS_ERR(kmsg)) return PTR_ERR(kmsg); @@ -1246,14 +1221,14 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, kmsg->msg.msg_flags)) { zc->len -= ret; zc->buf += ret; zc->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1281,7 +1256,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr iomsg, *kmsg; + struct io_async_msghdr *kmsg; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1298,15 +1273,17 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) kmsg = req->async_data; kmsg->msg.msg_control_user = sr->msg_control; } else { - ret = io_sendmsg_copy_hdr(req, &iomsg); + kmsg = io_msg_alloc_async(req, issue_flags); + if (unlikely(!kmsg)) + return -ENOMEM; + ret = io_sendmsg_copy_hdr(req, kmsg); if (ret) return ret; - kmsg = &iomsg; } if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; flags = sr->msg_flags | MSG_ZEROCOPY; if (issue_flags & IO_URING_F_NONBLOCK) @@ -1320,12 +1297,12 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret < min_ret)) { if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) - return io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return 
io_setup_async_msg(req, kmsg, issue_flags); + return -EAGAIN; } if (ret == -ERESTARTSYS) ret = -EINTR; -- cgit v1.2.3 From 3ba8345aec886a3a01331e944a6a8568bf94bd10 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Apr 2024 12:39:54 -0600 Subject: io_uring/net: always set kmsg->msg.msg_control_user before issue We currently set this separately for async/sync entry, but let's just move it to a generic pre-issue spot and eliminate the difference between the two. Signed-off-by: Jens Axboe --- io_uring/net.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index d095d3e48fcd..b08c0ae5951a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -391,7 +391,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; } else { kmsg = io_msg_alloc_async(req, issue_flags); if (unlikely(!kmsg)) @@ -411,6 +410,8 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (ret < min_ret) { @@ -1271,7 +1272,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (req_has_async_data(req)) { kmsg = req->async_data; - kmsg->msg.msg_control_user = sr->msg_control; } else { kmsg = io_msg_alloc_async(req, issue_flags); if (unlikely(!kmsg)) @@ -1291,6 +1291,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (flags & MSG_WAITALL) min_ret = iov_iter_count(&kmsg->msg.msg_iter); + kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); -- cgit v1.2.3 From c6f32c7d9e09bf1368447e9a29e869193ecbb756 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 07:36:03 -0600 Subject: io_uring/net: get rid of ->prep_async() for receive side Move the io_async_msghdr out of the issue path and into prep handling, since it's now done unconditionally and hence does not need to be part of the issue path. This reduces the footprint of the multishot fast path of multiple invocations of ->issue() per prep, and also means that using ->prep_async() can be dropped for recvmsg as this is now done via setup on the prep side.
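To make the shape of that change concrete, here is a small userspace C sketch, with invented names (prepared_req, req_prep, req_issue) rather than io_uring APIs, of doing the one-time copy-in at prep time so a multishot issue path can run repeatedly against already-prepared state:

#include <stdio.h>
#include <string.h>

struct prepared_req {
	char name[32];		/* stand-in for the copied-in msghdr/address */
	size_t len;
	int prepared;
};

static int req_prep(struct prepared_req *r, const char *uname, size_t len)
{
	if (len >= sizeof(r->name))
		return -1;
	memcpy(r->name, uname, len);	/* the one-time "copy_hdr" style work */
	r->name[len] = '\0';
	r->len = len;
	r->prepared = 1;
	return 0;
}

static int req_issue(const struct prepared_req *r, int shot)
{
	if (!r->prepared)
		return -1;
	/* no per-invocation import: just use the prepared state */
	printf("shot %d: receiving for %s (%zu byte header)\n", shot, r->name, r->len);
	return 0;
}

int main(void)
{
	struct prepared_req r = { 0 };
	int shot;

	if (req_prep(&r, "peer0", 5))
		return 1;
	for (shot = 0; shot < 3; shot++)	/* multishot: prep once, issue many times */
		req_issue(&r, shot);
	return 0;
}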
Signed-off-by: Jens Axboe --- io_uring/net.c | 71 ++++++++++++++++++++++---------------------------------- io_uring/net.h | 1 - io_uring/opdef.c | 2 -- 3 files changed, 28 insertions(+), 46 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index b08c0ae5951a..7cd93cd8b8c4 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -595,17 +595,36 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, msg.msg_controllen); } -int io_recvmsg_prep_async(struct io_kiocb *req) +static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *iomsg; + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + /* always locked for prep */ + kmsg = io_msg_alloc_async(req, 0); + if (unlikely(!kmsg)) return -ENOMEM; - iomsg = req->async_data; - ret = io_recvmsg_copy_hdr(req, iomsg); + + if (req->opcode == IORING_OP_RECV) { + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_get_inq = 1; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_iocb = NULL; + kmsg->msg.msg_ubuf = NULL; + + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + return 0; + } + + ret = io_recvmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; @@ -656,7 +675,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags |= MSG_CMSG_COMPAT; #endif sr->nr_multishot_loops = 0; - return 0; + return io_recvmsg_prep_setup(req); } static inline void io_recv_prep_retry(struct io_kiocb *req, @@ -814,7 +833,7 @@ static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io, int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -825,17 +844,6 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - kmsg = io_msg_alloc_async(req, issue_flags); - if (unlikely(!kmsg)) - return -ENOMEM; - ret = io_recvmsg_copy_hdr(req, kmsg); - if (ret) - return ret; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; @@ -914,36 +922,13 @@ retry_multishot: int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; size_t len = sr->len; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - kmsg = io_msg_alloc_async(req, issue_flags); - if (unlikely(!kmsg)) - return -ENOMEM; - kmsg->free_iov = NULL; - kmsg->msg.msg_name = NULL; - kmsg->msg.msg_namelen = 0; - kmsg->msg.msg_control = NULL; - kmsg->msg.msg_get_inq = 1; - kmsg->msg.msg_controllen = 0; - kmsg->msg.msg_iocb = NULL; - kmsg->msg.msg_ubuf = NULL; - - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; diff --git a/io_uring/net.h 
b/io_uring/net.h index 5c1230f1aaf9..4b4fd9b1b7b4 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -42,7 +42,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); int io_sendrecv_prep_async(struct io_kiocb *req); -int io_recvmsg_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); int io_recv(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 77131826d603..1368193edc57 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -536,7 +536,6 @@ const struct io_cold_def io_cold_defs[] = { .name = "RECVMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_recvmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -613,7 +612,6 @@ const struct io_cold_def io_cold_defs[] = { .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_sendrecv_prep_async, #endif }, [IORING_OP_OPENAT2] = { -- cgit v1.2.3 From 50220d6ac8ff31eb065fba818e960f549fb89d4d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 08:09:47 -0600 Subject: io_uring/net: get rid of ->prep_async() for send side Move the io_async_msghdr out of the issue path and into prep handling, since it's now done unconditionally and hence does not need to be part of the issue path. This removes any usage of io_sendrecv_prep_async() and io_sendmsg_prep_async(), and hence the forced async setup path is now unified with the normal prep setup. Signed-off-by: Jens Axboe --- io_uring/net.c | 154 +++++++++++++++++-------------------------------------- io_uring/net.h | 2 - io_uring/opdef.c | 4 -- 3 files changed, 46 insertions(+), 114 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 7cd93cd8b8c4..a57b15e91ad9 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -290,50 +290,56 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, return ret; } -int io_sendrecv_prep_async(struct io_kiocb *req) +void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) +{ + struct io_async_msghdr *io = req->async_data; + + kfree(io->free_iov); +} + +static int io_send_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *kmsg = req->async_data; int ret; - if (req_has_async_data(req)) - return 0; - sr->done_io = 0; - if (!sr->addr) - return 0; - io = io_msg_alloc_async_prep(req); - if (!io) - return -ENOMEM; - memset(&io->msg, 0, sizeof(io->msg)); - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &io->msg.msg_iter); - if (unlikely(ret)) + kmsg->msg.msg_name = NULL; + kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_control = NULL; + kmsg->msg.msg_controllen = 0; + kmsg->msg.msg_ubuf = NULL; + + if (sr->addr) { + ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); + if (unlikely(ret < 0)) + return ret; + kmsg->msg.msg_name = &kmsg->addr; + kmsg->msg.msg_namelen = sr->addr_len; + } + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); + if (unlikely(ret < 0)) return ret; - io->msg.msg_name = &io->addr; - io->msg.msg_namelen = sr->addr_len; - return move_addr_to_kernel(sr->addr, sr->addr_len, &io->addr); + + return 0; } -int io_sendmsg_prep_async(struct io_kiocb *req) +static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) { - struct
io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg; int ret; - sr->done_io = 0; - if (!io_msg_alloc_async_prep(req)) + /* always locked for prep */ + kmsg = io_msg_alloc_async(req, 0); + if (unlikely(!kmsg)) return -ENOMEM; - ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!is_msg) + return io_send_setup(req); + ret = io_sendmsg_copy_hdr(req, kmsg); if (!ret) req->flags |= REQ_F_NEED_CLEANUP; return ret; } -void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) -{ - struct io_async_msghdr *io = req->async_data; - - kfree(io->free_iov); -} - int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -362,7 +368,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); } static void io_req_msg_cleanup(struct io_kiocb *req, @@ -379,7 +385,7 @@ static void io_req_msg_cleanup(struct io_kiocb *req, int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; @@ -389,17 +395,6 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - kmsg = io_msg_alloc_async(req, issue_flags); - if (unlikely(!kmsg)) - return -ENOMEM; - ret = io_sendmsg_copy_hdr(req, kmsg); - if (ret) - return ret; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; @@ -437,52 +432,10 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -static struct io_async_msghdr *io_send_setup(struct io_kiocb *req, - unsigned int issue_flags) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; - int ret; - - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - kmsg = io_msg_alloc_async(req, issue_flags); - if (unlikely(!kmsg)) - return ERR_PTR(-ENOMEM); - kmsg->msg.msg_name = NULL; - kmsg->msg.msg_namelen = 0; - kmsg->msg.msg_control = NULL; - kmsg->msg.msg_controllen = 0; - kmsg->msg.msg_ubuf = NULL; - - if (sr->addr) { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, - &kmsg->addr); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - kmsg->msg.msg_name = &kmsg->addr; - kmsg->msg.msg_namelen = sr->addr_len; - } - - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ERR_PTR(ret); - } - - if (!(req->flags & REQ_F_POLLED) && - (sr->flags & IORING_RECVSEND_POLL_FIRST)) - return ERR_PTR(-EAGAIN); - - return kmsg; -} - int io_send(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; - size_t len = sr->len; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int min_ret = 0; @@ -492,13 +445,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = io_send_setup(req, issue_flags); - if (IS_ERR(kmsg)) - return PTR_ERR(kmsg); - - ret = import_ubuf(ITER_SOURCE, sr->buf, len, &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; + if (!(req->flags & REQ_F_POLLED) && + 
(sr->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) @@ -1084,7 +1033,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) zc->msg_flags |= MSG_CMSG_COMPAT; #endif - return 0; + return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); } static int io_sg_from_iter_iovec(struct sock *sk, struct sk_buff *skb, @@ -1173,7 +1122,7 @@ static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned msg_flags; int ret, min_ret = 0; @@ -1184,9 +1133,9 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - kmsg = io_send_setup(req, issue_flags); - if (IS_ERR(kmsg)) - return PTR_ERR(kmsg); + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; if (!zc->done_io) { ret = io_send_zc_import(req, kmsg); @@ -1242,7 +1191,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; unsigned flags; int ret, min_ret = 0; @@ -1255,17 +1204,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (!test_bit(SOCK_SUPPORT_ZC, &sock->flags)) return -EOPNOTSUPP; - if (req_has_async_data(req)) { - kmsg = req->async_data; - } else { - kmsg = io_msg_alloc_async(req, issue_flags); - if (unlikely(!kmsg)) - return -ENOMEM; - ret = io_sendmsg_copy_hdr(req, kmsg); - if (ret) - return ret; - } - if (!(req->flags & REQ_F_POLLED) && (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; diff --git a/io_uring/net.h b/io_uring/net.h index 4b4fd9b1b7b4..f99ebb9dc0bb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -34,13 +34,11 @@ struct io_async_connect { int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); -int io_sendmsg_prep_async(struct io_kiocb *req); void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req); int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags); int io_send(struct io_kiocb *req, unsigned int issue_flags); -int io_sendrecv_prep_async(struct io_kiocb *req); int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 1368193edc57..dd4a1e1425e1 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -527,7 +527,6 @@ const struct io_cold_def io_cold_defs[] = { .name = "SENDMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -603,7 +602,6 @@ const struct io_cold_def io_cold_defs[] = { .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, - .prep_async = io_sendrecv_prep_async, #endif }, [IORING_OP_RECV] = { @@ -688,7 +686,6 @@ const struct io_cold_def 
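The simplification can be pictured with a short userspace model, assuming (as the message states) that every caller now allocates from the locked prep path. The names here (ctx.uring_locked, alloc_state) are invented, and the assert() merely stands in for lockdep_assert_held():

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>

struct ctx {
	bool uring_locked;	/* stand-in for holding ctx->uring_lock */
	void *cache_slot;	/* one-entry cache, only touched under the lock */
};

static void *alloc_state(struct ctx *c, size_t size)
{
	void *obj;

	assert(c->uring_locked);	/* invariant replaces the issue_flags check */
	obj = c->cache_slot;		/* try the cache first ... */
	if (obj) {
		c->cache_slot = NULL;
		return obj;
	}
	return calloc(1, size);		/* ... or fall back to a fresh allocation */
}

int main(void)
{
	struct ctx c = { .uring_locked = true };
	void *st = alloc_state(&c, 64);

	free(st);
	return 0;
}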
io_cold_defs[] = { .name = "SEND_ZC", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendrecv_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -697,7 +694,6 @@ const struct io_cold_def io_cold_defs[] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), - .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif -- cgit v1.2.3 From 6498c5c97ce73770ed227eb52b14d21c8343fd5b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 10:07:37 -0600 Subject: io_uring: kill io_msg_alloc_async_prep() We now ONLY call io_msg_alloc_async() from inside prep handling, which is always locked. No need for this helper anymore, or the check in io_msg_alloc_async() on whether the ring is locked or not. Signed-off-by: Jens Axboe --- io_uring/net.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index a57b15e91ad9..77f2eb79e5e0 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -129,22 +129,19 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } } -static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, - unsigned int issue_flags) +static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_cache_entry *entry; struct io_async_msghdr *hdr; - if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; - } + entry = io_alloc_cache_get(&ctx->netmsg_cache); + if (entry) { + hdr = container_of(entry, struct io_async_msghdr, cache); + hdr->free_iov = NULL; + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = hdr; + return hdr; } if (!io_alloc_async_data(req)) { @@ -155,12 +152,6 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req, return NULL; } -static inline struct io_async_msghdr *io_msg_alloc_async_prep(struct io_kiocb *req) -{ - /* ->prep_async is always called from the submission context */ - return io_msg_alloc_async(req, 0); -} - #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -328,8 +319,7 @@ static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) struct io_async_msghdr *kmsg; int ret; - /* always locked for prep */ - kmsg = io_msg_alloc_async(req, 0); + kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) return -ENOMEM; if (!is_msg) @@ -550,8 +540,7 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) struct io_async_msghdr *kmsg; int ret; - /* always locked for prep */ - kmsg = io_msg_alloc_async(req, 0); + kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) return -ENOMEM; -- cgit v1.2.3 From 9f8539fe299c250af42325eccff66e8b8d1f15da Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Mar 2024 19:09:50 -0600 Subject: io_uring/net: remove (now) dead code in io_netmsg_recycle() All net commands have async data at this point, there's no reason to check if this is the case or not. 
Signed-off-by: Jens Axboe --- io_uring/net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 77f2eb79e5e0..adc6d6e1cce6 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -119,7 +119,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; - if (!req_has_async_data(req) || issue_flags & IO_URING_F_UNLOCKED) + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; /* Let normal cleanup path reap it if we fail adding to the cache */ -- cgit v1.2.3 From 75191341785eef51f87ff54b0ed9dfbd5a72e7c2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 15:33:53 -0600 Subject: io_uring/net: add iovec recycling Right now the io_async_msghdr is recycled to avoid the overhead of allocating+freeing it for every request. But the iovec is not included, hence that will be allocated and freed for each transfer regardless. This commit enables recyling of the iovec between io_async_msghdr recycles. This avoids alloc+free for each one if an iovec is used, and on top of that, it extends the cache hot nature of msg to the iovec as well. Also enables KASAN for the iovec entries, so that reuse can be detected even while they are in the cache. The io_async_msghdr also shrinks from 376 -> 288 bytes, an 88 byte saving (or ~23% smaller), as the fast_iovec entry is dropped from 8 entries to a single entry. There's no point keeping a big fast iovec entry, if iovecs aren't being allocated and freed continually. Signed-off-by: Jens Axboe --- io_uring/net.c | 131 ++++++++++++++++++++++++++++++++++++--------------------- io_uring/net.h | 13 +++--- 2 files changed, 91 insertions(+), 53 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index adc6d6e1cce6..2727b67f6a72 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -115,15 +115,31 @@ static bool io_net_retry(struct socket *sock, int flags) return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; } +static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) +{ + if (kmsg->free_iov) { + kfree(kmsg->free_iov); + kmsg->free_iov_nr = 0; + kmsg->free_iov = NULL; + } +} + static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; + struct iovec *iov; - if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) + /* can't recycle, ensure we free the iovec if we have one */ + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_netmsg_iovec_free(hdr); return; + } /* Let normal cleanup path reap it if we fail adding to the cache */ + iov = hdr->free_iov; if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + if (iov) + kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } @@ -138,7 +154,11 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) entry = io_alloc_cache_get(&ctx->netmsg_cache); if (entry) { hdr = container_of(entry, struct io_async_msghdr, cache); - hdr->free_iov = NULL; + if (hdr->free_iov) { + kasan_mempool_unpoison_object(hdr->free_iov, + hdr->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; + } req->flags |= REQ_F_ASYNC_DATA; req->async_data = hdr; return hdr; @@ -146,12 +166,27 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) if (!io_alloc_async_data(req)) { hdr = req->async_data; + hdr->free_iov_nr = 0; hdr->free_iov = NULL; return hdr; } return NULL; } +/* assign new iovec to kmsg, if we need to */ +static int 
io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, + struct iovec *iov) +{ + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; + if (kmsg->free_iov) + kfree(kmsg->free_iov); + kmsg->free_iov = iov; + } + return 0; +} + #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -159,7 +194,16 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct compat_iovec __user *uiov; - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; @@ -168,9 +212,9 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, if (req->flags & REQ_F_BUFFER_SELECT) { compat_ssize_t clen; - iomsg->free_iov = NULL; if (msg->msg_iovlen == 0) { - sr->len = 0; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { return -EINVAL; } else { @@ -186,14 +230,12 @@ static int io_compat_msg_copy_hdr(struct io_kiocb *req, return 0; } - iomsg->free_iov = iomsg->fast_iov; ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - UIO_FASTIOV, &iomsg->free_iov, - &iomsg->msg.msg_iter, true); + nr_segs, &iov, &iomsg->msg.msg_iter, true); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } #endif @@ -201,7 +243,16 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, struct user_msghdr *msg, int ddir) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - int ret; + struct iovec *iov; + int ret, nr_segs; + + if (iomsg->free_iov) { + nr_segs = iomsg->free_iov_nr; + iov = iomsg->free_iov; + } else { + iov = &iomsg->fast_iov; + nr_segs = 1; + } if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) return -EFAULT; @@ -217,9 +268,8 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iomsg->fast_iov[0].iov_len = 0; - iomsg->fast_iov[0].iov_base = NULL; - iomsg->free_iov = NULL; + sr->len = iov->iov_len = 0; + iov->iov_base = NULL; } else if (msg->msg_iovlen > 1) { ret = -EINVAL; goto ua_end; @@ -227,10 +277,9 @@ static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, /* we only need the length for provided buffers */ if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) goto ua_end; - unsafe_get_user(iomsg->fast_iov[0].iov_len, - &msg->msg_iov[0].iov_len, ua_end); - sr->len = iomsg->fast_iov[0].iov_len; - iomsg->free_iov = NULL; + unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, + ua_end); + sr->len = iov->iov_len; } ret = 0; ua_end: @@ -239,13 +288,12 @@ ua_end: } user_access_end(); - iomsg->free_iov = iomsg->fast_iov; - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, UIO_FASTIOV, - &iomsg->free_iov, &iomsg->msg.msg_iter, false); + ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, + &iov, &iomsg->msg.msg_iter, false); if (unlikely(ret < 0)) return ret; - return 0; + return io_net_vec_assign(req, iomsg, iov); } static int io_sendmsg_copy_hdr(struct io_kiocb *req, @@ -285,7 +333,7 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) { struct io_async_msghdr *io = req->async_data; - kfree(io->free_iov); + io_netmsg_iovec_free(io); } 
static int io_send_setup(struct io_kiocb *req) @@ -366,9 +414,6 @@ static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags) { req->flags &= ~REQ_F_NEED_CLEANUP; - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) - kfree(kmsg->free_iov); io_netmsg_recycle(req, issue_flags); } @@ -621,11 +666,6 @@ static inline void io_recv_prep_retry(struct io_kiocb *req, { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov = NULL; - } - req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; sr->len = 0; /* get from the provided buffer */ @@ -941,14 +981,10 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); - struct io_async_msghdr *io; + struct io_async_msghdr *io = req->async_data; - if (req_has_async_data(req)) { - io = req->async_data; - /* might be ->fast_iov if *msg_copy_hdr failed */ - if (io->free_iov != io->fast_iov) - kfree(io->free_iov); - } + if (req_has_async_data(req)) + io_netmsg_iovec_free(io); if (zc->notif) { io_notif_flush(zc->notif); zc->notif = NULL; @@ -1170,8 +1206,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); - io_netmsg_recycle(req, issue_flags); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, kmsg, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1221,13 +1256,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; req_set_fail(req); } - /* fast path, check for non-NULL to avoid function call */ - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov = NULL; - } - io_netmsg_recycle(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -1239,7 +1268,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); - req->flags &= ~REQ_F_NEED_CLEANUP; + io_req_msg_cleanup(req, kmsg, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1483,6 +1512,14 @@ out: void io_netmsg_cache_free(struct io_cache_entry *entry) { - kfree(container_of(entry, struct io_async_msghdr, cache)); + struct io_async_msghdr *kmsg; + + kmsg = container_of(entry, struct io_async_msghdr, cache); + if (kmsg->free_iov) { + kasan_mempool_unpoison_object(kmsg->free_iov, + kmsg->free_iov_nr * sizeof(struct iovec)); + io_netmsg_iovec_free(kmsg); + } + kfree(kmsg); } #endif diff --git a/io_uring/net.h b/io_uring/net.h index f99ebb9dc0bb..0aef1c992aee 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -8,17 +8,18 @@ struct io_async_msghdr { #if defined(CONFIG_NET) union { - struct iovec fast_iov[UIO_FASTIOV]; + struct iovec fast_iov; struct { - struct iovec fast_iov_one; - __kernel_size_t controllen; - int namelen; - __kernel_size_t payloadlen; + struct io_cache_entry cache; + /* entry size of ->free_iov, if valid */ + int free_iov_nr; }; - struct io_cache_entry cache; }; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + __kernel_size_t controllen; + __kernel_size_t payloadlen; + int namelen; struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; -- cgit v1.2.3 From d80f940701302e84d1398ecb103083468b566a69 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 13:52:42 -0600 Subject: io_uring/net: drop 'kmsg' parameter from io_req_msg_cleanup() Now that iovec 
recycling is being done, the iovec is no longer being freed in there. Hence the kmsg parameter is now useless. Signed-off-by: Jens Axboe --- io_uring/net.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 2727b67f6a72..1e1d77321fce 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -410,7 +410,6 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } static void io_req_msg_cleanup(struct io_kiocb *req, - struct io_async_msghdr *kmsg, unsigned int issue_flags) { req->flags &= ~REQ_F_NEED_CLEANUP; @@ -458,7 +457,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) ret = -EINTR; req_set_fail(req); } - io_req_msg_cleanup(req, kmsg, issue_flags); + io_req_msg_cleanup(req, issue_flags); if (ret >= 0) ret += sr->done_io; else if (sr->done_io) @@ -512,7 +511,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; - io_req_msg_cleanup(req, kmsg, issue_flags); + io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } @@ -720,7 +719,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, *ret = IOU_STOP_MULTISHOT; else *ret = IOU_OK; - io_req_msg_cleanup(req, kmsg, issue_flags); + io_req_msg_cleanup(req, issue_flags); return true; } @@ -1206,7 +1205,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); - io_req_msg_cleanup(req, kmsg, 0); + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; @@ -1268,7 +1267,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); - io_req_msg_cleanup(req, kmsg, 0); + io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); return IOU_OK; -- cgit v1.2.3 From a9165b83c1937eeed1f0c731468216d6371d647f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:13:01 -0600 Subject: io_uring/rw: always setup io_async_rw for read/write requests read/write requests try to put everything on the stack, and then alloc and copy if a retry is needed. This necessitates a bunch of nasty code that deals with intermediate state. Get rid of this, and have the prep side setup everything that is needed upfront, which greatly simplifies the opcode handlers. This includes adding an alloc cache for io_async_rw, to make it cheap to handle. In terms of cost, this should be basically free and transparent. For the worst case of {READ,WRITE}_FIXED which didn't need it before, performance is unaffected in the normal peak workload that is being used to test that. Still runs at 122M IOPS. 
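One detail that makes the upfront io_async_rw setup safe is the save/restore discipline around the iterator. The following is only a userspace sketch of that idea, modelling the iov_iter_save_state()/iov_iter_restore() behaviour with a toy iterator (iter, iter_state, try_write are invented names), not the kernel implementation:

#include <stdio.h>

struct iter {
	const char *buf;
	size_t len;
	size_t off;		/* how far a previous partial attempt got */
};

struct iter_state {
	size_t saved_off;	/* analogous to iov_iter_save_state() */
};

static void iter_save(const struct iter *it, struct iter_state *st)
{
	st->saved_off = it->off;
}

static void iter_restore(struct iter *it, const struct iter_state *st)
{
	it->off = st->saved_off;	/* analogous to iov_iter_restore() */
}

/* pretend I/O: a nonblocking attempt makes partial progress, then would block */
static int try_write(struct iter *it, int blocking)
{
	if (!blocking && it->off == 0) {
		it->off += 4;		/* the side effect survives the failure */
		return -11;		/* -EAGAIN */
	}
	it->off = it->len;		/* the blocking retry completes the I/O */
	return 0;
}

int main(void)
{
	struct iter it = { .buf = "hello world", .len = 11 };
	struct iter_state st;

	iter_save(&it, &st);		/* done once, at prep time */
	if (try_write(&it, 0) == -11) {	/* nonblocking attempt consumed part of the iter */
		iter_restore(&it, &st);	/* rewind instead of re-importing the user buffer */
		try_write(&it, 1);	/* retried (e.g. via io-wq) from a known-good state */
	}
	printf("wrote %zu of %zu bytes\n", it.off, it.len);
	return 0;
}

The point is that the prepared state, including the saved iterator position, lives in the request's async data from the start, so a retry never has to rebuild it from the SQE.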
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 3 + io_uring/opdef.c | 15 +- io_uring/rw.c | 538 +++++++++++++++++++---------------------- io_uring/rw.h | 19 +- 5 files changed, 278 insertions(+), 298 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 75b46119d4c8..60d7e35fc303 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -300,6 +300,7 @@ struct io_ring_ctx { struct io_hash_table cancel_table_locked; struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; + struct io_alloc_cache rw_cache; /* * Any cancelable uring_cmd is added to this list in diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e6e7794d497f..b0554d122931 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -311,6 +311,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); + io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_rw)); io_futex_cache_init(ctx); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); @@ -2823,6 +2825,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index dd4a1e1425e1..fcae75a08f2c 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,7 +67,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .prep = io_prep_readv, .issue = io_read, }, [IORING_OP_WRITEV] = { @@ -81,7 +81,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, - .prep = io_prep_rwv, + .prep = io_prep_writev, .issue = io_write, }, [IORING_OP_FSYNC] = { @@ -99,7 +99,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .prep = io_prep_read_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { @@ -112,7 +112,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw_fixed, + .prep = io_prep_write_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { @@ -239,7 +239,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_read, .issue = io_read, }, [IORING_OP_WRITE] = { @@ -252,7 +252,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, - .prep = io_prep_rw, + .prep = io_prep_write, .issue = io_write, }, [IORING_OP_FADVISE] = { @@ -490,14 +490,12 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_READV] = { .async_size = sizeof(struct io_async_rw), .name = "READV", - .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .async_size = sizeof(struct io_async_rw), .name = "WRITEV", - .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, @@ -699,6 +697,7 @@ const struct io_cold_def io_cold_defs[] = { #endif }, [IORING_OP_READ_MULTISHOT] = { + .async_size = 
sizeof(struct io_async_rw), .name = "READ_MULTISHOT", }, [IORING_OP_WAITID] = { diff --git a/io_uring/rw.c b/io_uring/rw.c index b5165d1c9686..2625e177c9bc 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -75,7 +75,153 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_import_iovec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *buf; + size_t sqe_len; + + buf = u64_to_user_ptr(rw->addr); + sqe_len = rw->len; + + if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + + return import_ubuf(ddir, buf, sqe_len, &io->s.iter); + } + + io->free_iovec = io->s.fast_iov; + return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, + &io->s.iter, req->ctx->compat); +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + int ret; + + ret = __io_import_iovec(rw, req, io, issue_flags); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + return 0; +} + +static void io_rw_iovec_free(struct io_async_rw *rw) +{ + if (rw->free_iovec) { + kfree(rw->free_iovec); + rw->free_iovec = NULL; + } +} + +static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_async_rw *rw = req->async_data; + + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { + io_rw_iovec_free(rw); + return; + } + if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + +static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + /* + * Disable quick recycling for anything that's gone through io-wq. + * In theory, this should be fine to cleanup. However, some read or + * write iter handling touches the iovec AFTER having called into the + * handler, eg to reexpand or revert. This means we can have: + * + * task io-wq + * issue + * punt to io-wq + * issue + * blkdev_write_iter() + * ->ki_complete() + * io_complete_rw() + * queue tw complete + * run tw + * req_rw_cleanup + * iov_iter_count() <- look at iov_iter again + * + * which can lead to a UAF. This is only possible for io-wq offload + * as the cleanup can run in parallel. As io-wq is not the fast path, + * just leave cleanup to the end. + * + * This is really a bug in the core code that does this, any issue + * path should assume that a successful (or -EIOCBQUEUED) return can + * mean that the underlying data can be gone at any time. But that + * should be fixed seperately, and then this check could be killed. 
+ */ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags &= ~REQ_F_NEED_CLEANUP; + io_rw_recycle(req, issue_flags); + } +} + +static int io_rw_alloc_async(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + struct io_async_rw *rw; + + entry = io_alloc_cache_get(&ctx->rw_cache); + if (entry) { + rw = container_of(entry, struct io_async_rw, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = rw; + goto done; + } + + if (!io_alloc_async_data(req)) { + rw = req->async_data; +done: + rw->free_iovec = NULL; + rw->bytes_done = 0; + return 0; + } + + return -ENOMEM; +} + +static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +{ + struct io_async_rw *rw; + int ret; + + if (io_rw_alloc_async(req)) + return -ENOMEM; + + if (!do_import || io_do_buffer_select(req)) + return 0; + + rw = req->async_data; + ret = io_import_iovec(ddir, req, rw, 0); + if (unlikely(ret < 0)) + return ret; + + iov_iter_save_state(&rw->s.iter, &rw->s.iter_state); + return 0; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir, bool do_import) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); unsigned ioprio; @@ -100,34 +246,58 @@ int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return 0; + return io_prep_rw_setup(req, ddir, do_import); } -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + return io_prep_rw(req, sqe, ITER_DEST, true); +} + +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw(req, sqe, ITER_SOURCE, true); +} + +static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, do_import); if (unlikely(ret)) return ret; + if (do_import) + return 0; /* * Have to do this validation here, as this is in io_read() rw->len * might have chanaged due to buffer selection */ - if (req->flags & REQ_F_BUFFER_SELECT) - return io_iov_buffer_select_prep(req); + return io_iov_buffer_select_prep(req); +} - return 0; +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_DEST); +} + +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rwv(req, sqe, ITER_SOURCE); } -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct io_ring_ctx *ctx = req->ctx; + struct io_async_rw *io; u16 index; int ret; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ddir, false); if (unlikely(ret)) return ret; @@ -136,7 +306,21 @@ int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); - return 0; + + io = req->async_data; + ret = io_import_fixed(ddir, &io->s.iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + return ret; +} + +int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, 
ITER_DEST); +} + +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return io_prep_rw_fixed(req, sqe, ITER_SOURCE); } /* @@ -152,7 +336,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe); + ret = io_prep_rw(req, sqe, ITER_DEST, false); if (unlikely(ret)) return ret; @@ -165,9 +349,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - struct io_async_rw *io = req->async_data; - - kfree(io->free_iovec); + io_rw_iovec_free(req->async_data); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -188,14 +370,11 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) } #ifdef CONFIG_BLOCK -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - if (!req_has_async_data(req)) - return !io_req_prep_async(req); iov_iter_restore(&io->s.iter, &io->s.iter_state); - return true; } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -224,9 +403,8 @@ static bool io_rw_should_reissue(struct io_kiocb *req) return true; } #else -static bool io_resubmit_prep(struct io_kiocb *req) +static void io_resubmit_prep(struct io_kiocb *req) { - return false; } static bool io_rw_should_reissue(struct io_kiocb *req) { @@ -308,6 +486,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) req->cqe.flags |= io_put_kbuf(req, 0); + io_req_rw_cleanup(req, 0); io_req_task_complete(req, ts); } @@ -388,6 +567,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, issue_flags)); + io_req_rw_cleanup(req, issue_flags); return IOU_OK; } } else { @@ -396,71 +576,12 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; - if (io_resubmit_prep(req)) - return -EAGAIN; - else - io_req_task_queue_fail(req, final_ret); + io_resubmit_prep(req); + return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; } -static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req, - struct io_rw_state *s, - unsigned int issue_flags) -{ - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct iov_iter *iter = &s->iter; - u8 opcode = req->opcode; - struct iovec *iovec; - void __user *buf; - size_t sqe_len; - ssize_t ret; - - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!io_issue_defs[opcode].vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return ERR_PTR(-ENOBUFS); - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - - ret = import_ubuf(ddir, buf, sqe_len, iter); - if (ret) - return ERR_PTR(ret); - return NULL; - } - - iovec = s->fast_iov; - ret = __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &iovec, iter, - req->ctx->compat); - if (unlikely(ret < 0)) - return ERR_PTR(ret); - return iovec; -} - -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct iovec **iovec, struct io_rw_state *s, - unsigned int issue_flags) -{ - *iovec = 
__io_import_iovec(rw, req, s, issue_flags); - if (IS_ERR(*iovec)) - return PTR_ERR(*iovec); - - iov_iter_save_state(&s->iter, &s->iter_state); - return 0; -} - static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) { return (kiocb->ki_filp->f_mode & FMODE_STREAM) ? NULL : &kiocb->ki_pos; @@ -532,89 +653,6 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) return ret; } -static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, struct iov_iter *iter) -{ - struct io_async_rw *io = req->async_data; - - memcpy(&io->s.iter, iter, sizeof(*iter)); - io->free_iovec = iovec; - io->bytes_done = 0; - /* can only be fixed buffers, no need to do anything */ - if (iov_iter_is_bvec(iter) || iter_is_ubuf(iter)) - return; - if (!iovec) { - unsigned iov_off = 0; - - io->s.iter.__iov = io->s.fast_iov; - if (iter->__iov != fast_iov) { - iov_off = iter_iov(iter) - fast_iov; - io->s.iter.__iov += iov_off; - } - if (io->s.fast_iov != fast_iov) - memcpy(io->s.fast_iov + iov_off, fast_iov + iov_off, - sizeof(struct iovec) * iter->nr_segs); - } else { - req->flags |= REQ_F_NEED_CLEANUP; - } -} - -static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - struct io_rw_state *s, bool force) -{ - if (!force && !io_cold_defs[req->opcode].prep_async) - return 0; - /* opcode type doesn't need async data */ - if (!io_cold_defs[req->opcode].async_size) - return 0; - if (!req_has_async_data(req)) { - struct io_async_rw *iorw; - - if (io_alloc_async_data(req)) { - kfree(iovec); - return -ENOMEM; - } - - io_req_map_rw(req, iovec, s->fast_iov, &s->iter); - iorw = req->async_data; - /* we've copied and mapped the iter, ensure state is saved */ - iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); - } - return 0; -} - -static inline int io_rw_prep_async(struct io_kiocb *req, int rw) -{ - struct io_async_rw *iorw = req->async_data; - struct iovec *iov; - int ret; - - iorw->bytes_done = 0; - iorw->free_iovec = NULL; - - /* submission path, ->uring_lock should already be taken */ - ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); - if (unlikely(ret < 0)) - return ret; - - if (iov) { - iorw->free_iovec = iov; - req->flags |= REQ_F_NEED_CLEANUP; - } - - return 0; -} - -int io_readv_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_DEST); -} - -int io_writev_prep_async(struct io_kiocb *req) -{ - return io_rw_prep_async(req, ITER_SOURCE); -} - /* * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. 
@@ -754,54 +792,28 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) static int __io_read(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - struct io_async_rw *io; - ssize_t ret, ret2; + ssize_t ret; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); + if (io_do_buffer_select(req)) { + ret = io_import_iovec(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; - } else { - io = req->async_data; - s = &io->s; - - /* - * Safe and required to re-import if we're using provided - * buffers, as we dropped the selected one before retry. - */ - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } - - /* - * We come here from an earlier attempt, restore our state to - * match in case it doesn't. It's cheap enough that we don't - * need to make this conditional. - */ - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; } + ret = io_rw_init_file(req, FMODE_READ); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ - if (unlikely(!io_file_supports_nowait(req))) { - ret = io_setup_async_rw(req, iovec, s, true); - return ret ?: -EAGAIN; - } + if (unlikely(!io_file_supports_nowait(req))) + return -EAGAIN; kiocb->ki_flags |= IOCB_NOWAIT; } else { /* Ensure we clear previously set non-block flag */ @@ -811,20 +823,15 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(READ, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->s.iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; - /* - * If we can poll, just do that. For a vectored read, we'll - * need to copy state first. - */ - if (io_file_can_poll(req) && !io_issue_defs[req->opcode].vectored) + /* If we can poll, just do that. */ + if (io_file_can_poll(req)) return -EAGAIN; /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -834,8 +841,6 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) goto done; ret = 0; } else if (ret == -EIOCBQUEUED) { - if (iovec) - kfree(iovec); return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { @@ -848,21 +853,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&s->iter, &s->iter_state); - - ret2 = io_setup_async_rw(req, iovec, s, true); - iovec = NULL; - if (ret2) { - ret = ret > 0 ? ret : ret2; - goto done; - } - - io = req->async_data; - s = &io->s; - /* - * Now use our persistent iterator and state, if we aren't already. - * We've restored and mapped the iter to match. 
- */ + iov_iter_restore(&io->s.iter, &io->s.iter_state); do { /* @@ -870,11 +861,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(&s->iter, ret); - if (!iov_iter_count(&s->iter)) + iov_iter_advance(&io->s.iter, ret); + if (!iov_iter_count(&io->s.iter)) break; io->bytes_done += ret; - iov_iter_save_state(&s->iter, &s->iter_state); + iov_iter_save_state(&io->s.iter, &io->s.iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -882,24 +873,22 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &s->iter); + ret = io_iter_do_read(rw, &io->s.iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&s->iter, &s->iter_state); + iov_iter_restore(&io->s.iter, &io->s.iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); return ret; } @@ -908,8 +897,9 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) int ret; ret = __io_read(req, issue_flags); - if (ret >= 0) - return kiocb_done(req, ret, issue_flags); + if (ret >= 0) { + ret = kiocb_done(req, ret, issue_flags); + } return ret; } @@ -981,6 +971,7 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) * multishot request, hitting overflow will terminate it. */ io_req_set_res(req, ret, cflags); + io_req_rw_cleanup(req, issue_flags); if (issue_flags & IO_URING_F_MULTISHOT) return IOU_STOP_MULTISHOT; return IOU_OK; @@ -988,42 +979,28 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) int io_write(struct io_kiocb *req, unsigned int issue_flags) { + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_rw_state __s, *s = &__s; - struct iovec *iovec; + struct io_async_rw *io = req->async_data; struct kiocb *kiocb = &rw->kiocb; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; ssize_t ret, ret2; loff_t *ppos; - if (!req_has_async_data(req)) { - ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags); - if (unlikely(ret < 0)) - return ret; - } else { - struct io_async_rw *io = req->async_data; - - s = &io->s; - iov_iter_restore(&s->iter, &s->iter_state); - iovec = NULL; - } ret = io_rw_init_file(req, FMODE_WRITE); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } - req->cqe.res = iov_iter_count(&s->iter); + req->cqe.res = iov_iter_count(&io->s.iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ if (unlikely(!io_file_supports_nowait(req))) - goto copy_iov; + goto ret_eagain; /* File path supports NOWAIT for non-direct_IO only for block devices. 
*/ if (!(kiocb->ki_flags & IOCB_DIRECT) && !(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) && (req->flags & REQ_F_ISREG)) - goto copy_iov; + goto ret_eagain; kiocb->ki_flags |= IOCB_NOWAIT; } else { @@ -1034,19 +1011,17 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ppos = io_kiocb_update_pos(req); ret = rw_verify_area(WRITE, req->file, ppos, req->cqe.res); - if (unlikely(ret)) { - kfree(iovec); + if (unlikely(ret)) return ret; - } if (req->flags & REQ_F_ISREG) kiocb_start_write(kiocb); kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &s->iter); + ret2 = call_write_iter(req->file, kiocb, &io->s.iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &s->iter); + ret2 = loop_rw_iter(WRITE, rw, &io->s.iter); else ret2 = -EINVAL; @@ -1067,11 +1042,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) - goto copy_iov; + goto ret_eagain; if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) { - struct io_async_rw *io; - trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2, req->cqe.res, ret2); @@ -1080,33 +1053,22 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. */ - iov_iter_save_state(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, true); - - io = req->async_data; - if (io) - io->bytes_done += ret2; + iov_iter_save_state(&io->s.iter, &io->s.iter_state); + io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); - return ret ? ret : -EAGAIN; + return -EAGAIN; } done: ret = kiocb_done(req, ret2, issue_flags); } else { -copy_iov: - iov_iter_restore(&s->iter, &s->iter_state); - ret = io_setup_async_rw(req, iovec, s, false); - if (!ret) { - if (kiocb->ki_flags & IOCB_WRITE) - io_req_end_write(req); - return -EAGAIN; - } - return ret; +ret_eagain: + iov_iter_restore(&io->s.iter, &io->s.iter_state); + if (kiocb->ki_flags & IOCB_WRITE) + io_req_end_write(req); + return -EAGAIN; } - /* it's reportedly faster than delegating the null check to kfree() */ - if (iovec) - kfree(iovec); return ret; } @@ -1181,6 +1143,8 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) break; nr_events++; req->cqe.flags = io_put_kbuf(req, 0); + if (req->opcode != IORING_OP_URING_CMD) + io_req_rw_cleanup(req, 0); } if (unlikely(!nr_events)) return 0; @@ -1194,3 +1158,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) __io_submit_flush_completions(ctx); return nr_events; } + +void io_rw_cache_free(struct io_cache_entry *entry) +{ + struct io_async_rw *rw; + + rw = container_of(entry, struct io_async_rw, cache); + kfree(rw); +} diff --git a/io_uring/rw.h b/io_uring/rw.h index f9e89b4fe4da..f7905070d10b 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,21 +9,26 @@ struct io_rw_state { }; struct io_async_rw { + union { + size_t bytes_done; + struct io_cache_entry cache; + }; struct io_rw_state s; - const struct iovec *free_iovec; - size_t bytes_done; + struct iovec *free_iovec; struct wait_page_queue wpq; }; -int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read_fixed(struct io_kiocb *req, const 
struct io_uring_sqe *sqe); +int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); -int io_readv_prep_async(struct io_kiocb *req); int io_write(struct io_kiocb *req, unsigned int issue_flags); -int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); +void io_rw_cache_free(struct io_cache_entry *entry); -- cgit v1.2.3 From 0d10bd77a1be0742a12e1bcf0554a4bcbdbc0f35 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:25:58 -0600 Subject: io_uring: get rid of struct io_rw_state A separate state struct is not needed anymore, just fold it in with io_async_rw. Signed-off-by: Jens Axboe --- io_uring/rw.c | 45 +++++++++++++++++++++++---------------------- io_uring/rw.h | 10 +++------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 2625e177c9bc..9743df5617fd 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -96,12 +96,12 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, rw->len = sqe_len; } - return import_ubuf(ddir, buf, sqe_len, &io->s.iter); + return import_ubuf(ddir, buf, sqe_len, &io->iter); } - io->free_iovec = io->s.fast_iov; + io->free_iovec = io->fast_iov; return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, - &io->s.iter, req->ctx->compat); + &io->iter, req->ctx->compat); } static inline int io_import_iovec(int rw, struct io_kiocb *req, @@ -114,7 +114,7 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, if (unlikely(ret < 0)) return ret; - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); return 0; } @@ -216,7 +216,7 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) if (unlikely(ret < 0)) return ret; - iov_iter_save_state(&rw->s.iter, &rw->s.iter_state); + iov_iter_save_state(&rw->iter, &rw->iter_state); return 0; } @@ -308,8 +308,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe io_req_set_rsrc_node(req, ctx, 0); io = req->async_data; - ret = io_import_fixed(ddir, &io->s.iter, req->imu, rw->addr, rw->len); - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); + iov_iter_save_state(&io->iter, &io->iter_state); return ret; } @@ -374,7 +374,7 @@ static void io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *io = req->async_data; - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } static bool io_rw_should_reissue(struct io_kiocb *req) @@ -808,7 +808,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_rw_init_file(req, FMODE_READ); if (unlikely(ret)) return ret; - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ 
-826,7 +826,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; - ret = io_iter_do_read(rw, &io->s.iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -853,7 +853,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * untouched in case of error. Restore it and we'll advance it * manually if we need to. */ - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); do { /* @@ -861,11 +861,11 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * above or inside this loop. Advance the iter by the bytes * that were consumed. */ - iov_iter_advance(&io->s.iter, ret); - if (!iov_iter_count(&io->s.iter)) + iov_iter_advance(&io->iter, ret); + if (!iov_iter_count(&io->iter)) break; io->bytes_done += ret; - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { @@ -873,19 +873,19 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); /* * Now retry read with the IOCB_WAITQ parts set in the iocb. If * we get -EIOCBQUEUED, then we'll get a notification when the * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(rw, &io->s.iter); + ret = io_iter_do_read(rw, &io->iter); if (ret == -EIOCBQUEUED) return IOU_ISSUE_SKIP_COMPLETE; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); } while (ret > 0); done: /* it's faster to check here then delegate to kfree */ @@ -989,7 +989,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ret = io_rw_init_file(req, FMODE_WRITE); if (unlikely(ret)) return ret; - req->cqe.res = iov_iter_count(&io->s.iter); + req->cqe.res = iov_iter_count(&io->iter); if (force_nonblock) { /* If the file doesn't support async, just async punt */ @@ -1019,9 +1019,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) - ret2 = call_write_iter(req->file, kiocb, &io->s.iter); + ret2 = call_write_iter(req->file, kiocb, &io->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, rw, &io->s.iter); + ret2 = loop_rw_iter(WRITE, rw, &io->iter); else ret2 = -EINVAL; @@ -1053,7 +1053,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) * in the worker. Also update bytes_done to account for * the bytes already written. 
*/ - iov_iter_save_state(&io->s.iter, &io->s.iter_state); + iov_iter_save_state(&io->iter, &io->iter_state); io->bytes_done += ret2; if (kiocb->ki_flags & IOCB_WRITE) @@ -1064,7 +1064,7 @@ done: ret = kiocb_done(req, ret2, issue_flags); } else { ret_eagain: - iov_iter_restore(&io->s.iter, &io->s.iter_state); + iov_iter_restore(&io->iter, &io->iter_state); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; @@ -1164,5 +1164,6 @@ void io_rw_cache_free(struct io_cache_entry *entry) struct io_async_rw *rw; rw = container_of(entry, struct io_async_rw, cache); + kfree(rw->free_iovec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index f7905070d10b..7824896dc52d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -2,18 +2,14 @@ #include -struct io_rw_state { - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; -}; - struct io_async_rw { union { size_t bytes_done; struct io_cache_entry cache; }; - struct io_rw_state s; + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov[UIO_FASTIOV]; struct iovec *free_iovec; struct wait_page_queue wpq; }; -- cgit v1.2.3 From cca6571381a0bdc88021a1f7a4c2349df21279f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 22 Mar 2024 20:41:18 -0600 Subject: io_uring/rw: cleanup retry path We no longer need to gate a potential retry on whether or not the context matches our original task, as all read/write operations have been fully prepared upfront. This means there's never any re-import needed, and hence we can always retry requests. Signed-off-by: Jens Axboe --- io_uring/rw.c | 35 ++++++++--------------------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index 9743df5617fd..ab4454aa8e30 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -369,16 +369,9 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; struct io_ring_ctx *ctx = req->ctx; @@ -394,23 +387,11 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). 
- */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -575,8 +556,10 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_REISSUE) { + struct io_async_rw *io = req->async_data; + req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); + iov_iter_restore(&io->iter, &io->iter_state); return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; @@ -897,9 +880,8 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) int ret; ret = __io_read(req, issue_flags); - if (ret >= 0) { - ret = kiocb_done(req, ret, issue_flags); - } + if (ret >= 0) + return kiocb_done(req, ret, issue_flags); return ret; } @@ -1061,7 +1043,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) return -EAGAIN; } done: - ret = kiocb_done(req, ret2, issue_flags); + return kiocb_done(req, ret2, issue_flags); } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); @@ -1069,7 +1051,6 @@ ret_eagain: io_req_end_write(req); return -EAGAIN; } - return ret; } void io_rw_fail(struct io_kiocb *req) -- cgit v1.2.3 From d6f911a6b22f8e48aec82cd5f6b5a14dc76a56c3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 16:31:44 -0600 Subject: io_uring/rw: add iovec recycling Let the io_async_rw hold on to the iovec and reuse it, rather than always allocate and free them. Also enables KASAN for the iovec entries, so that reuse can be detected even while they are in the cache. While doing so, shrink io_async_rw by getting rid of the bigger embedded fast iovec. Since iovecs are being recycled now, shrink it from 8 to 1. This reduces the io_async_rw size from 264 to 160 bytes, a 40% reduction. 
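The recycling scheme above can be sketched in isolation. The following is a standalone userspace C illustration, not the kernel code itself (rw_state, rw_get_iovec and the other names are made up for the example): the heap-allocated iovec array and its capacity live alongside the per-request state, get reused whenever they are big enough for the next import, and are only freed when the state itself is torn down.

	#include <stdlib.h>
	#include <stdio.h>
	#include <sys/uio.h>

	struct rw_state {
		struct iovec inline_iov;	/* covers the common single-segment case */
		struct iovec *free_iov;		/* cached heap allocation, if any */
		int free_iov_nr;		/* capacity of free_iov in segments */
	};

	static struct iovec *rw_get_iovec(struct rw_state *s, int nr_segs)
	{
		if (nr_segs <= 1)
			return &s->inline_iov;
		if (nr_segs <= s->free_iov_nr)
			return s->free_iov;	/* reuse the cached array */
		free(s->free_iov);
		s->free_iov = calloc(nr_segs, sizeof(struct iovec));
		s->free_iov_nr = s->free_iov ? nr_segs : 0;
		return s->free_iov;
	}

	static void rw_state_free(struct rw_state *s)
	{
		free(s->free_iov);
		s->free_iov = NULL;
		s->free_iov_nr = 0;
	}

	int main(void)
	{
		struct rw_state s = { 0 };
		char a[8], b[8];
		struct iovec *iov;

		iov = rw_get_iovec(&s, 2);	/* first use: allocates */
		if (!iov)
			return 1;
		iov[0] = (struct iovec){ a, sizeof(a) };
		iov[1] = (struct iovec){ b, sizeof(b) };

		iov = rw_get_iovec(&s, 2);	/* second use: same array comes back */
		printf("reused array at %p, capacity %d segments\n",
		       (void *)iov, s.free_iov_nr);

		rw_state_free(&s);
		return 0;
	}

The kernel version additionally poisons the cached array with KASAN while it sits idle so stale accesses are caught; that part is omitted from the sketch.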
Signed-off-by: Jens Axboe --- io_uring/rw.c | 42 +++++++++++++++++++++++++++++++++++++----- io_uring/rw.h | 3 ++- 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/io_uring/rw.c b/io_uring/rw.c index ab4454aa8e30..e84d322a6150 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -81,7 +81,9 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, { const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct iovec *iov; void __user *buf; + int nr_segs, ret; size_t sqe_len; buf = u64_to_user_ptr(rw->addr); @@ -99,9 +101,24 @@ static int __io_import_iovec(int ddir, struct io_kiocb *req, return import_ubuf(ddir, buf, sqe_len, &io->iter); } - io->free_iovec = io->fast_iov; - return __import_iovec(ddir, buf, sqe_len, UIO_FASTIOV, &io->free_iovec, - &io->iter, req->ctx->compat); + if (io->free_iovec) { + nr_segs = io->free_iov_nr; + iov = io->free_iovec; + } else { + iov = &io->fast_iov; + nr_segs = 1; + } + ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, + req->ctx->compat); + if (unlikely(ret < 0)) + return ret; + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io->free_iov_nr = io->iter.nr_segs; + kfree(io->free_iovec); + io->free_iovec = iov; + } + return 0; } static inline int io_import_iovec(int rw, struct io_kiocb *req, @@ -122,6 +139,7 @@ static void io_rw_iovec_free(struct io_async_rw *rw) { if (rw->free_iovec) { kfree(rw->free_iovec); + rw->free_iov_nr = 0; rw->free_iovec = NULL; } } @@ -129,12 +147,16 @@ static void io_rw_iovec_free(struct io_async_rw *rw) static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; + struct iovec *iov; if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { io_rw_iovec_free(rw); return; } + iov = rw->free_iovec; if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + if (iov) + kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } @@ -184,6 +206,11 @@ static int io_rw_alloc_async(struct io_kiocb *req) entry = io_alloc_cache_get(&ctx->rw_cache); if (entry) { rw = container_of(entry, struct io_async_rw, cache); + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + req->flags |= REQ_F_NEED_CLEANUP; + } req->flags |= REQ_F_ASYNC_DATA; req->async_data = rw; goto done; @@ -191,8 +218,9 @@ static int io_rw_alloc_async(struct io_kiocb *req) if (!io_alloc_async_data(req)) { rw = req->async_data; -done: rw->free_iovec = NULL; + rw->free_iov_nr = 0; +done: rw->bytes_done = 0; return 0; } @@ -1145,6 +1173,10 @@ void io_rw_cache_free(struct io_cache_entry *entry) struct io_async_rw *rw; rw = container_of(entry, struct io_async_rw, cache); - kfree(rw->free_iovec); + if (rw->free_iovec) { + kasan_mempool_unpoison_object(rw->free_iovec, + rw->free_iov_nr * sizeof(struct iovec)); + io_rw_iovec_free(rw); + } kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index 7824896dc52d..cf51d0eb407a 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -9,8 +9,9 @@ struct io_async_rw { }; struct iov_iter iter; struct iov_iter_state iter_state; - struct iovec fast_iov[UIO_FASTIOV]; + struct iovec fast_iov; struct iovec *free_iovec; + int free_iov_nr; struct wait_page_queue wpq; }; -- cgit v1.2.3 From e2ea5a7069133c01fe3dbda95d77af7f193a1a52 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 20:37:22 -0600 Subject: io_uring/net: move connect to always using async data While doing that, get rid of 
io_async_connect and just use the generic io_async_msghdr. Both of them have a struct sockaddr_storage in there, and while io_async_msghdr is bigger, if the same type can be used then the netmsg_cache can get reused for connect as well. Signed-off-by: Jens Axboe --- io_uring/net.c | 41 +++++++++++------------------------------ io_uring/net.h | 5 ----- io_uring/opdef.c | 3 +-- 3 files changed, 12 insertions(+), 37 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 1e1d77321fce..3bef562b67de 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1428,17 +1428,10 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } -int io_connect_prep_async(struct io_kiocb *req) -{ - struct io_async_connect *io = req->async_data; - struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); - - return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); -} - int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = io_kiocb_to_cmd(req, struct io_connect); + struct io_async_msghdr *io; if (sqe->len || sqe->buf_index || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; @@ -1446,32 +1439,26 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); conn->in_progress = conn->seen_econnaborted = false; - return 0; + + io = io_msg_alloc_async(req); + if (unlikely(!io)) + return -ENOMEM; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->addr); } int io_connect(struct io_kiocb *req, unsigned int issue_flags) { struct io_connect *connect = io_kiocb_to_cmd(req, struct io_connect); - struct io_async_connect __io, *io; + struct io_async_msghdr *io = req->async_data; unsigned file_flags; int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req_has_async_data(req)) { - io = req->async_data; - } else { - ret = move_addr_to_kernel(connect->addr, - connect->addr_len, - &__io.address); - if (ret) - goto out; - io = &__io; - } - file_flags = force_nonblock ? 
O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, &io->address, - connect->addr_len, file_flags); + ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, + file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS || ret == -ECONNABORTED) && force_nonblock) { if (ret == -EINPROGRESS) { @@ -1481,13 +1468,6 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) goto out; connect->seen_econnaborted = true; } - if (req_has_async_data(req)) - return -EAGAIN; - if (io_alloc_async_data(req)) { - ret = -ENOMEM; - goto out; - } - memcpy(req->async_data, &__io, sizeof(__io)); return -EAGAIN; } if (connect->in_progress) { @@ -1505,6 +1485,7 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) out: if (ret < 0) req_set_fail(req); + io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); return IOU_OK; } diff --git a/io_uring/net.h b/io_uring/net.h index 0aef1c992aee..b47b43ec6459 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -28,10 +28,6 @@ struct io_async_msghdr { #if defined(CONFIG_NET) -struct io_async_connect { - struct sockaddr_storage address; -}; - int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_shutdown(struct io_kiocb *req, unsigned int issue_flags); @@ -53,7 +49,6 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags); int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_socket(struct io_kiocb *req, unsigned int issue_flags); -int io_connect_prep_async(struct io_kiocb *req); int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index fcae75a08f2c..1951107210d4 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -557,8 +557,7 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_CONNECT] = { .name = "CONNECT", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_connect), - .prep_async = io_connect_prep_async, + .async_size = sizeof(struct io_async_msghdr), #endif }, [IORING_OP_FALLOCATE] = { -- cgit v1.2.3 From d10f19dff56eac5ae44dc270336b18071a8bd51c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 20:41:58 -0600 Subject: io_uring/uring_cmd: switch to always allocating async data Basic conversion ensuring async_data is allocated off the prep path. Adds a basic alloc cache as well, as passthrough IO can be quite high in rate. 
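As a rough illustration of the prep-time allocation plus recycling pattern described above, here is a standalone userspace sketch (the cmd_cache/cmd_async names are hypothetical, not the kernel API): prep either pops a previously used object off a small free list or allocates a fresh one, and completion pushes it back unless the cache is already full.

	#include <stdlib.h>
	#include <stdio.h>

	#define CACHE_MAX 128

	struct cmd_async {
		struct cmd_async *next;		/* free-list link, unused while in flight */
		unsigned char sqe_copy[128];	/* stable per-command state */
	};

	struct cmd_cache {
		struct cmd_async *head;
		int nr;
	};

	static struct cmd_async *cmd_prep(struct cmd_cache *c)
	{
		struct cmd_async *a = c->head;

		if (a) {			/* fast path: reuse a cached object */
			c->head = a->next;
			c->nr--;
			return a;
		}
		return malloc(sizeof(*a));	/* slow path: fresh allocation */
	}

	static void cmd_cleanup(struct cmd_cache *c, struct cmd_async *a)
	{
		if (c->nr < CACHE_MAX) {	/* keep it for the next request */
			a->next = c->head;
			c->head = a;
			c->nr++;
		} else {
			free(a);
		}
	}

	int main(void)
	{
		struct cmd_cache cache = { 0 };
		struct cmd_async *a = cmd_prep(&cache);	/* first use: plain malloc */

		if (!a)
			return 1;
		cmd_cleanup(&cache, a);			/* goes into the cache */
		printf("cached entries: %d\n", cache.nr);
		a = cmd_prep(&cache);			/* same object comes back */
		free(a);
		return 0;
	}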
Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 3 ++ io_uring/opdef.c | 1 - io_uring/uring_cmd.c | 77 ++++++++++++++++++++++++++++++------------ io_uring/uring_cmd.h | 10 +++++- 5 files changed, 69 insertions(+), 23 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 60d7e35fc303..0f24fdad19c2 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -301,6 +301,7 @@ struct io_ring_ctx { struct io_alloc_cache apoll_cache; struct io_alloc_cache netmsg_cache; struct io_alloc_cache rw_cache; + struct io_alloc_cache uring_cache; /* * Any cancelable uring_cmd is added to this list in diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b0554d122931..a868c0a253a6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -313,6 +313,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) sizeof(struct io_async_msghdr)); io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); + io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct uring_cache)); io_futex_cache_init(ctx); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); @@ -2826,6 +2828,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, io_uring_cache_free); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 1951107210d4..745246086c23 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -677,7 +677,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_URING_CMD] = { .name = "URING_CMD", .async_size = 2 * sizeof(struct io_uring_sqe), - .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 4614ce734fee..9bd0ba87553f 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -14,6 +14,38 @@ #include "rsrc.h" #include "uring_cmd.h" +static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_cache_entry *entry; + struct uring_cache *cache; + + entry = io_alloc_cache_get(&ctx->uring_cache); + if (entry) { + cache = container_of(entry, struct uring_cache, cache); + req->flags |= REQ_F_ASYNC_DATA; + req->async_data = cache; + return cache; + } + if (!io_alloc_async_data(req)) + return req->async_data; + return NULL; +} + +static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache = req->async_data; + + if (issue_flags & IO_URING_F_UNLOCKED) + return; + if (io_alloc_cache_put(&req->ctx->uring_cache, &cache->cache)) { + ioucmd->sqe = NULL; + req->async_data = NULL; + req->flags &= ~REQ_F_ASYNC_DATA; + } +} + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all) { @@ -128,6 +160,7 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, io_req_set_res(req, ret, 0); if (req->ctx->flags & IORING_SETUP_CQE32) io_req_set_cqe32_extra(req, res2, 0); + io_req_uring_cleanup(req, issue_flags); if 
(req->ctx->flags & IORING_SETUP_IOPOLL) { /* order with io_iopoll_req_issued() checking ->iopoll_complete */ smp_store_release(&req->iopoll_completed, 1); @@ -142,13 +175,19 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -int io_uring_cmd_prep_async(struct io_kiocb *req) +static int io_uring_cmd_prep_setup(struct io_kiocb *req, + const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct uring_cache *cache; - memcpy(req->async_data, ioucmd->sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = req->async_data; - return 0; + cache = io_uring_async_get(req); + if (cache) { + memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = req->async_data; + return 0; + } + return -ENOMEM; } int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -173,9 +212,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->imu = ctx->user_bufs[index]; io_req_set_rsrc_node(req, ctx, 0); } - ioucmd->sqe = sqe; ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return 0; + + return io_uring_cmd_prep_setup(req, sqe); } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -206,23 +245,14 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - if (!req_has_async_data(req)) { - if (io_alloc_async_data(req)) - return -ENOMEM; - io_uring_cmd_prep_async(req); - } - return -EAGAIN; - } - - if (ret != -EIOCBQUEUED) { - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); + if (ret == -EAGAIN || ret == -EIOCBQUEUED) return ret; - } - return IOU_ISSUE_SKIP_COMPLETE; + if (ret < 0) + req_set_fail(req); + io_req_uring_cleanup(req, issue_flags); + io_req_set_res(req, ret, 0); + return ret; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -311,3 +341,8 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) } EXPORT_SYMBOL_GPL(io_uring_cmd_sock); #endif + +void io_uring_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct uring_cache, cache)); +} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 7356bf9aa655..b0ccff7091ee 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,8 +1,16 @@ // SPDX-License-Identifier: GPL-2.0 +struct uring_cache { + union { + struct io_cache_entry cache; + struct io_uring_sqe sqes[2]; + }; +}; + int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_uring_cmd_prep_async(struct io_kiocb *req); +void io_uring_cache_free(struct io_cache_entry *entry); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all); \ No newline at end of file + struct task_struct *task, bool cancel_all); -- cgit v1.2.3 From 5eff57fa9f3aae3acbcaf196af507eec58955f3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Mar 2024 15:23:47 -0600 Subject: io_uring/uring_cmd: defer SQE copying until it's needed The previous commit turned on async data for uring_cmd, and did the basic conversion of setting everything up on the prep side. However, for a lot of use cases, -EIOCBQUEUED will get returned on issue, as the operation got successfully queued. For that case, a persistent SQE isn't needed, as it's just used for issue. 
Unless execution goes async immediately, defer copying the double SQE until it's necessary. This greatly reduces the overhead of such commands, as evidenced by a perf diff from before and after this change: 10.60% -8.58% [kernel.vmlinux] [k] io_uring_cmd_prep where the prep side drops from 10.60% to ~2%, which is more expected. Performance also rises from ~113M IOPS to ~122M IOPS, bringing us back to where it was before the async command prep. Tested-by: Anuj Gupta Reviewed-by: Anuj Gupta Signed-off-by: Jens Axboe --- io_uring/uring_cmd.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9bd0ba87553f..92346b5d9f5b 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -182,12 +182,18 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, struct uring_cache *cache; cache = io_uring_async_get(req); - if (cache) { - memcpy(cache->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = req->async_data; + if (unlikely(!cache)) + return -ENOMEM; + + if (!(req->flags & REQ_F_FORCE_ASYNC)) { + /* defer memcpy until we need it */ + ioucmd->sqe = sqe; return 0; } - return -ENOMEM; + + memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = req->async_data; + return 0; } int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -245,8 +251,15 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN || ret == -EIOCBQUEUED) - return ret; + if (ret == -EAGAIN) { + struct uring_cache *cache = req->async_data; + + if (ioucmd->sqe != (void *) cache) + memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); + return -EAGAIN; + } else if (ret == -EIOCBQUEUED) { + return -EIOCBQUEUED; + } if (ret < 0) req_set_fail(req); -- cgit v1.2.3 From e10677a8f6980dbae2e866b8320d90bae07e87ee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 Mar 2024 20:48:38 -0600 Subject: io_uring: drop ->prep_async() It's now unused, drop the code related to it. This includes the io_issue_defs->manual alloc field. While in there, and since ->async_size is now being used a bit more frequently and in the issue path, move it to io_issue_defs[]. 
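The deferred-copy idea can be shown with a small standalone userspace sketch (all names here are invented for the example, not the kernel's): prep only records a pointer to the submission-ring entry, and the stable copy is made lazily, only when the command actually has to outlive the issue attempt.

	#include <string.h>
	#include <stdio.h>
	#include <errno.h>

	struct fake_sqe { unsigned char data[64]; };

	struct cmd {
		const struct fake_sqe *sqe;	/* may point into the shared ring */
		struct fake_sqe stable;		/* owned copy, filled lazily */
	};

	static void cmd_prep(struct cmd *c, const struct fake_sqe *ring_sqe)
	{
		c->sqe = ring_sqe;		/* no copy on the fast path */
	}

	static int cmd_issue(struct cmd *c, int would_block)
	{
		if (would_block) {
			/* going async: the ring slot may be reused, so copy now */
			if (c->sqe != &c->stable) {
				memcpy(&c->stable, c->sqe, sizeof(c->stable));
				c->sqe = &c->stable;
			}
			return -EAGAIN;
		}
		return 0;			/* completed inline, copy never needed */
	}

	int main(void)
	{
		struct fake_sqe ring_sqe = { { 0x42 } };
		struct cmd c;

		cmd_prep(&c, &ring_sqe);
		if (cmd_issue(&c, 1) == -EAGAIN)
			printf("copied lazily, sqe[0]=0x%x\n", c.sqe->data[0]);
		return 0;
	}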
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 36 ++++-------------------------------- io_uring/io_uring.h | 1 - io_uring/opdef.c | 43 ++++++++++++++++++------------------------- io_uring/opdef.h | 9 +++------ io_uring/uring_cmd.h | 1 - 5 files changed, 25 insertions(+), 65 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a868c0a253a6..579618fad833 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1712,8 +1712,10 @@ io_req_flags_t io_file_get_flags(struct file *file) bool io_alloc_async_data(struct io_kiocb *req) { - WARN_ON_ONCE(!io_cold_defs[req->opcode].async_size); - req->async_data = kmalloc(io_cold_defs[req->opcode].async_size, GFP_KERNEL); + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); if (req->async_data) { req->flags |= REQ_F_ASYNC_DATA; return false; @@ -1721,25 +1723,6 @@ bool io_alloc_async_data(struct io_kiocb *req) return true; } -int io_req_prep_async(struct io_kiocb *req) -{ - const struct io_cold_def *cdef = &io_cold_defs[req->opcode]; - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - /* assign early for deferred execution for non-fixed file */ - if (def->needs_file && !(req->flags & REQ_F_FIXED_FILE) && !req->file) - req->file = io_file_get_normal(req, req->cqe.fd); - if (!cdef->prep_async) - return 0; - if (WARN_ON_ONCE(req_has_async_data(req))) - return -EFAULT; - if (!def->manual_alloc) { - if (io_alloc_async_data(req)) - return -EAGAIN; - } - return cdef->prep_async(req); -} - static u32 io_get_sequence(struct io_kiocb *req) { u32 seq = req->ctx->cached_sq_head; @@ -2057,13 +2040,6 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { - int ret = io_req_prep_async(req); - - if (unlikely(ret)) { - io_req_defer_failed(req, ret); - return; - } - if (unlikely(req->ctx->drain_active)) io_drain_req(req); else @@ -2273,10 +2249,6 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, * conditions are true (normal request), then just queue it. 
*/ if (unlikely(link->head)) { - ret = io_req_prep_async(req); - if (unlikely(ret)) - return io_submit_fail_init(sqe, req, ret); - trace_io_uring_link(req, link->head); link->last->link = req; link->last = req; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index bae8c1e937c1..7654dfb34c2e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -95,7 +95,6 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); -int io_req_prep_async(struct io_kiocb *req); struct io_wq_work *io_wq_free_work(struct io_wq_work *work); void io_wq_submit_work(struct io_wq_work *work); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 745246086c23..a16f73938ebb 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -67,6 +67,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_readv, .issue = io_read, }, @@ -81,6 +82,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll = 1, .iopoll_queue = 1, .vectored = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_writev, .issue = io_write, }, @@ -99,6 +101,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, .issue = io_read, }, @@ -112,6 +115,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_write_fixed, .issue = io_write, }, @@ -138,8 +142,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_sendmsg, #else @@ -152,8 +156,8 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recvmsg, #else @@ -162,6 +166,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_timeout_prep, .issue = io_timeout, }, @@ -191,6 +196,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, + .async_size = sizeof(struct io_timeout_data), .prep = io_link_timeout_prep, .issue = io_no_issue, }, @@ -199,6 +205,7 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_connect_prep, .issue = io_connect, #else @@ -239,6 +246,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_read, .issue = io_read, }, @@ -252,6 +260,7 @@ const struct io_issue_def io_issue_defs[] = { .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_prep_write, .issue = io_write, }, @@ -272,8 +281,8 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, .issue = io_send, #else 
@@ -288,6 +297,7 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_recvmsg_prep, .issue = io_recv, #else @@ -403,6 +413,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, + .async_size = 2 * sizeof(struct io_uring_sqe), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -412,8 +423,8 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_send_zc, #else @@ -425,8 +436,8 @@ const struct io_issue_def io_issue_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, - .manual_alloc = 1, #if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else @@ -439,10 +450,12 @@ const struct io_issue_def io_issue_defs[] = { .pollin = 1, .buffer_select = 1, .audit_skip = 1, + .async_size = sizeof(struct io_async_rw), .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { + .async_size = sizeof(struct io_waitid_async), .prep = io_waitid_prep, .issue = io_waitid, }, @@ -488,13 +501,11 @@ const struct io_cold_def io_cold_defs[] = { .name = "NOP", }, [IORING_OP_READV] = { - .async_size = sizeof(struct io_async_rw), .name = "READV", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITEV", .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, @@ -503,12 +514,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", .fail = io_rw_fail, }, @@ -524,7 +533,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -532,13 +540,11 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { @@ -551,14 +557,10 @@ const struct io_cold_def io_cold_defs[] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { - .async_size = sizeof(struct io_timeout_data), .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", -#if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), -#endif }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", @@ -578,12 +580,10 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { - .async_size = sizeof(struct io_async_rw), .name = "READ", .fail = io_rw_fail, }, [IORING_OP_WRITE] = { - .async_size = sizeof(struct io_async_rw), .name = "WRITE", .fail = io_rw_fail, }, @@ -596,7 +596,6 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -604,7 +603,6 @@ const 
struct io_cold_def io_cold_defs[] = { [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif @@ -676,12 +674,10 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", - .async_size = 2 * sizeof(struct io_uring_sqe), }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif @@ -689,18 +685,15 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) - .async_size = sizeof(struct io_async_msghdr), .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_READ_MULTISHOT] = { - .async_size = sizeof(struct io_async_rw), .name = "READ_MULTISHOT", }, [IORING_OP_WAITID] = { .name = "WAITID", - .async_size = sizeof(struct io_waitid_async), }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 9e5435ec27d0..7ee6f5aa90aa 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -27,22 +27,19 @@ struct io_issue_def { unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; - /* opcode specific path will handle ->async_data allocation if needed */ - unsigned manual_alloc : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ unsigned vectored : 1; + /* size of async data needed, if any */ + unsigned short async_size; + int (*issue)(struct io_kiocb *, unsigned int); int (*prep)(struct io_kiocb *, const struct io_uring_sqe *); }; struct io_cold_def { - /* size of async data needed, if any */ - unsigned short async_size; - const char *name; - int (*prep_async)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index b0ccff7091ee..477ea8865639 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -9,7 +9,6 @@ struct uring_cache { int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_uring_cmd_prep_async(struct io_kiocb *req); void io_uring_cache_free(struct io_cache_entry *entry); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, -- cgit v1.2.3 From 414d0f45c316221acbf066658afdbae5b354a5cc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 Mar 2024 15:19:44 -0600 Subject: io_uring/alloc_cache: switch to array based caching Currently lists are being used to manage this, but best practice is usually to have these in an array instead as that is cheaper to manage. Outside of that detail, games are also played with KASAN as the list is inside the cached entry itself. Finally, all users of this need a struct io_cache_entry embedded in their struct, which is union'ized with something else in there that isn't used across the free -> realloc cycle. Get rid of all of that, and simply have it be an array. This will not change the memory used, as we're just trading an 8-byte member entry for the per-elem array size. This reduces the overhead of the recycled allocations, and it reduces the amount of code needed to support recycling to about half of what it currently is.
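As a rough sketch of the array-based scheme this message describes (the struct obj_cache type, the obj_cache_*() helpers and the userspace malloc()/free() calls below are illustrative stand-ins, not the io_uring code): freed objects are parked in a flat array of pointers, so the cached type no longer needs an embedded list node and a put is just a bounds check plus a store.

#include <stdlib.h>

struct obj_cache {
	void **entries;			/* cached, currently-unused objects */
	unsigned int nr_cached;		/* occupied slots in ->entries */
	unsigned int max_cached;	/* capacity of ->entries */
};

static int obj_cache_init(struct obj_cache *cache, unsigned int max_nr)
{
	cache->entries = calloc(max_nr, sizeof(void *));
	if (!cache->entries)
		return -1;
	cache->nr_cached = 0;
	cache->max_cached = max_nr;
	return 0;
}

/* Park an object for reuse; returns 0 if cached, -1 if the cache is full. */
static int obj_cache_put(struct obj_cache *cache, void *entry)
{
	if (cache->nr_cached >= cache->max_cached)
		return -1;
	cache->entries[cache->nr_cached++] = entry;
	return 0;
}

/* Pop the most recently cached object, or NULL if the cache is empty. */
static void *obj_cache_get(struct obj_cache *cache)
{
	if (!cache->nr_cached)
		return NULL;
	return cache->entries[--cache->nr_cached];
}

static void obj_cache_destroy(struct obj_cache *cache)
{
	void *entry;

	while ((entry = obj_cache_get(cache)) != NULL)
		free(entry);
	free(cache->entries);
	cache->entries = NULL;
}

When obj_cache_put() reports the cache full, the caller frees the object the ordinary way, which mirrors the kfree() fallback used throughout the patch.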
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/alloc_cache.h | 57 +++++++++++++++++++----------------------- io_uring/futex.c | 30 +++++++++------------- io_uring/futex.h | 5 ++-- io_uring/io_uring.c | 34 ++++++++++++++----------- io_uring/net.c | 13 ++++------ io_uring/net.h | 18 ++++--------- io_uring/poll.c | 12 +++------ io_uring/poll.h | 9 +------ io_uring/rsrc.c | 10 +++----- io_uring/rsrc.h | 7 +----- io_uring/rw.c | 14 +++++------ io_uring/rw.h | 7 ++---- io_uring/uring_cmd.c | 14 +++-------- io_uring/uring_cmd.h | 6 +---- 15 files changed, 93 insertions(+), 145 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0f24fdad19c2..ef45b8bd1b35 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -220,7 +220,7 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct io_wq_work_node list; + void **entries; unsigned int nr_cached; unsigned int max_cached; size_t elem_size; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 138ad14b0b12..b7a38a2069cf 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -6,61 +6,56 @@ */ #define IO_ALLOC_CACHE_MAX 128 -struct io_cache_entry { - struct io_wq_work_node node; -}; - static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, - struct io_cache_entry *entry) + void *entry) { if (cache->nr_cached < cache->max_cached) { - cache->nr_cached++; - wq_stack_add_head(&entry->node, &cache->list); - kasan_mempool_poison_object(entry); + if (!kasan_mempool_poison_object(entry)) + return false; + cache->entries[cache->nr_cached++] = entry; return true; } return false; } -static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) -{ - return !cache->list.next; -} - -static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) +static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (cache->list.next) { - struct io_cache_entry *entry; + if (cache->nr_cached) { + void *entry = cache->entries[--cache->nr_cached]; - entry = container_of(cache->list.next, struct io_cache_entry, node); kasan_mempool_unpoison_object(entry, cache->elem_size); - cache->list.next = cache->list.next->next; - cache->nr_cached--; return entry; } return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, +/* returns false if the cache was initialized properly */ +static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, unsigned max_nr, size_t size) { - cache->list.next = NULL; - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (cache->entries) { + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + return false; + } + return true; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(struct io_cache_entry *)) + void (*free)(const void *)) { - while (1) { - struct io_cache_entry *entry = io_alloc_cache_get(cache); + void *entry; + + if (!cache->entries) + return; - if (!entry) - break; + while ((entry = io_alloc_cache_get(cache)) != NULL) free(entry); - } - cache->nr_cached = 0; + + kvfree(cache->entries); + cache->entries = NULL; } #endif diff --git a/io_uring/futex.c b/io_uring/futex.c index 792a03df58de..914848f46beb 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -9,7 +9,7 @@ #include "../kernel/futex/futex.h" #include "io_uring.h" -#include "rsrc.h" +#include "alloc_cache.h" 
#include "futex.h" struct io_futex { @@ -27,27 +27,21 @@ struct io_futex { }; struct io_futex_data { - union { - struct futex_q q; - struct io_cache_entry cache; - }; + struct futex_q q; struct io_kiocb *req; }; -void io_futex_cache_init(struct io_ring_ctx *ctx) -{ - io_alloc_cache_init(&ctx->futex_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); -} +#define IO_FUTEX_ALLOC_CACHE_MAX 32 -static void io_futex_cache_entry_free(struct io_cache_entry *entry) +bool io_futex_cache_init(struct io_ring_ctx *ctx) { - kfree(container_of(entry, struct io_futex_data, cache)); + return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, + sizeof(struct io_futex_data)); } void io_futex_cache_free(struct io_ring_ctx *ctx) { - io_alloc_cache_free(&ctx->futex_cache, io_futex_cache_entry_free); + io_alloc_cache_free(&ctx->futex_cache, kfree); } static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) @@ -63,7 +57,7 @@ static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) struct io_ring_ctx *ctx = req->ctx; io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, &ifd->cache)) + if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); __io_futex_complete(req, ts); } @@ -259,11 +253,11 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) { - struct io_cache_entry *entry; + struct io_futex_data *ifd; - entry = io_alloc_cache_get(&ctx->futex_cache); - if (entry) - return container_of(entry, struct io_futex_data, cache); + ifd = io_alloc_cache_get(&ctx->futex_cache); + if (ifd) + return ifd; return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); } diff --git a/io_uring/futex.h b/io_uring/futex.h index 0847e9e8a127..b8bb09873d57 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -13,7 +13,7 @@ int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -void io_futex_cache_init(struct io_ring_ctx *ctx); +bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); #else static inline int io_futex_cancel(struct io_ring_ctx *ctx, @@ -27,8 +27,9 @@ static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, { return false; } -static inline void io_futex_cache_init(struct io_ring_ctx *ctx) +static inline bool io_futex_cache_init(struct io_ring_ctx *ctx) { + return false; } static inline void io_futex_cache_free(struct io_ring_ctx *ctx) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 579618fad833..1d453eb8e49f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -276,6 +276,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; + bool ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -305,17 +306,19 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); INIT_HLIST_HEAD(&ctx->io_buf_list); - io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, sizeof(struct async_poll)); - 
io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); - io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_rw)); - io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, sizeof(struct uring_cache)); - io_futex_cache_init(ctx); + ret |= io_futex_cache_init(ctx); + if (ret) + goto err; init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); @@ -345,6 +348,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) return ctx; err: + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); + io_futex_cache_free(ctx); kfree(ctx->cancel_table.hbs); kfree(ctx->cancel_table_locked.hbs); xa_destroy(&ctx->io_bl_xa); @@ -1482,7 +1491,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, &apoll->cache)) + if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) kfree(apoll); req->flags &= ~REQ_F_POLLED; } @@ -2778,11 +2787,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_rsrc_node_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct io_rsrc_node, cache)); -} - static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2797,10 +2801,10 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_files_unregister(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); + io_alloc_cache_free(&ctx->apoll_cache, kfree); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, io_uring_cache_free); + io_alloc_cache_free(&ctx->uring_cache, kfree); io_futex_cache_free(ctx); io_destroy_buffers(ctx); mutex_unlock(&ctx->uring_lock); @@ -2816,7 +2820,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); + io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; diff --git a/io_uring/net.c b/io_uring/net.c index 3bef562b67de..d0abc5689066 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -137,7 +137,7 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) /* Let normal cleanup path reap it if we fail adding to the cache */ iov = hdr->free_iov; - if (io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) { + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { if (iov) kasan_mempool_poison_object(iov); req->async_data = NULL; @@ -148,12 +148,10 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct 
io_async_msghdr *hdr; - entry = io_alloc_cache_get(&ctx->netmsg_cache); - if (entry) { - hdr = container_of(entry, struct io_async_msghdr, cache); + hdr = io_alloc_cache_get(&ctx->netmsg_cache); + if (hdr) { if (hdr->free_iov) { kasan_mempool_unpoison_object(hdr->free_iov, hdr->free_iov_nr * sizeof(struct iovec)); @@ -1490,11 +1488,10 @@ out: return IOU_OK; } -void io_netmsg_cache_free(struct io_cache_entry *entry) +void io_netmsg_cache_free(const void *entry) { - struct io_async_msghdr *kmsg; + struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - kmsg = container_of(entry, struct io_async_msghdr, cache); if (kmsg->free_iov) { kasan_mempool_unpoison_object(kmsg->free_iov, kmsg->free_iov_nr * sizeof(struct iovec)); diff --git a/io_uring/net.h b/io_uring/net.h index b47b43ec6459..0eb1c1920fc9 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -3,23 +3,15 @@ #include #include -#include "alloc_cache.h" - struct io_async_msghdr { #if defined(CONFIG_NET) - union { - struct iovec fast_iov; - struct { - struct io_cache_entry cache; - /* entry size of ->free_iov, if valid */ - int free_iov_nr; - }; - }; + struct iovec fast_iov; /* points to an allocated iov, if NULL we use fast_iov instead */ struct iovec *free_iov; + int free_iov_nr; + int namelen; __kernel_size_t controllen; __kernel_size_t payloadlen; - int namelen; struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; @@ -57,9 +49,9 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); -void io_netmsg_cache_free(struct io_cache_entry *entry); +void io_netmsg_cache_free(const void *entry); #else -static inline void io_netmsg_cache_free(struct io_cache_entry *entry) +static inline void io_netmsg_cache_free(const void *entry) { } #endif diff --git a/io_uring/poll.c b/io_uring/poll.c index 5d55bbf1de15..0a8e02944689 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -14,6 +14,7 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "refs.h" #include "napi.h" #include "opdef.h" @@ -686,17 +687,15 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct async_poll *apoll; if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - entry = io_alloc_cache_get(&ctx->apoll_cache); - if (entry == NULL) + apoll = io_alloc_cache_get(&ctx->apoll_cache); + if (!apoll) goto alloc_apoll; - apoll = container_of(entry, struct async_poll, cache); apoll->poll.retries = APOLL_MAX_RETRY; } else { alloc_apoll: @@ -1055,8 +1054,3 @@ out: io_req_set_res(req, ret, 0); return IOU_OK; } - -void io_apoll_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct async_poll, cache)); -} diff --git a/io_uring/poll.h b/io_uring/poll.h index 1dacae9e816c..5c240f11069a 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -#include "alloc_cache.h" - enum { IO_APOLL_OK, IO_APOLL_ABORTED, @@ -17,10 +15,7 @@ struct io_poll { }; struct async_poll { - union { - struct io_poll poll; - struct io_cache_entry cache; - }; + struct io_poll poll; struct io_poll *double_poll; }; @@ -46,6 +41,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool 
cancel_all); -void io_apoll_cache_free(struct io_cache_entry *entry); - void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 4818b79231dd..7b8a056f98ed 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -13,6 +13,7 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" @@ -169,7 +170,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) kfree(node); } @@ -197,12 +198,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; - struct io_cache_entry *entry; - entry = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (entry) { - ref_node = container_of(entry, struct io_rsrc_node, cache); - } else { + ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (!ref_node) { ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); if (!ref_node) return NULL; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index e21000238954..83c079a707f8 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,8 +2,6 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#include "alloc_cache.h" - #define IO_NODE_ALLOC_CACHE_MAX 32 #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) @@ -36,10 +34,7 @@ struct io_rsrc_data { }; struct io_rsrc_node { - union { - struct io_cache_entry cache; - struct io_ring_ctx *ctx; - }; + struct io_ring_ctx *ctx; int refs; bool empty; u16 type; diff --git a/io_uring/rw.c b/io_uring/rw.c index e84d322a6150..3134a6ece1be 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -18,6 +18,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "alloc_cache.h" #include "rsrc.h" #include "poll.h" #include "rw.h" @@ -154,7 +155,7 @@ static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) return; } iov = rw->free_iovec; - if (io_alloc_cache_put(&req->ctx->rw_cache, &rw->cache)) { + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { if (iov) kasan_mempool_poison_object(iov); req->async_data = NULL; @@ -200,12 +201,10 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) static int io_rw_alloc_async(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct io_async_rw *rw; - entry = io_alloc_cache_get(&ctx->rw_cache); - if (entry) { - rw = container_of(entry, struct io_async_rw, cache); + rw = io_alloc_cache_get(&ctx->rw_cache); + if (rw) { if (rw->free_iovec) { kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); @@ -1168,11 +1167,10 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) return nr_events; } -void io_rw_cache_free(struct io_cache_entry *entry) +void io_rw_cache_free(const void *entry) { - struct io_async_rw *rw; + struct io_async_rw *rw = (struct io_async_rw *) entry; - rw = container_of(entry, struct io_async_rw, cache); if (rw->free_iovec) { kasan_mempool_unpoison_object(rw->free_iovec, rw->free_iov_nr * sizeof(struct iovec)); diff --git a/io_uring/rw.h b/io_uring/rw.h index cf51d0eb407a..3f432dc75441 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -3,10 +3,7 @@ #include struct io_async_rw { - union { - size_t bytes_done; - struct io_cache_entry cache; - }; + size_t bytes_done; struct iov_iter iter; struct iov_iter_state iter_state; 
struct iovec fast_iov; @@ -28,4 +25,4 @@ void io_rw_fail(struct io_kiocb *req); void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); -void io_rw_cache_free(struct io_cache_entry *entry); +void io_rw_cache_free(const void *entry); diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 92346b5d9f5b..334d31dd6628 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -11,18 +11,17 @@ #include #include "io_uring.h" +#include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" static struct uring_cache *io_uring_async_get(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_cache_entry *entry; struct uring_cache *cache; - entry = io_alloc_cache_get(&ctx->uring_cache); - if (entry) { - cache = container_of(entry, struct uring_cache, cache); + cache = io_alloc_cache_get(&ctx->uring_cache); + if (cache) { req->flags |= REQ_F_ASYNC_DATA; req->async_data = cache; return cache; @@ -39,7 +38,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, &cache->cache)) { + if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; @@ -354,8 +353,3 @@ int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) } EXPORT_SYMBOL_GPL(io_uring_cmd_sock); #endif - -void io_uring_cache_free(struct io_cache_entry *entry) -{ - kfree(container_of(entry, struct uring_cache, cache)); -} diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 477ea8865639..a361f98664d2 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,15 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 struct uring_cache { - union { - struct io_cache_entry cache; - struct io_uring_sqe sqes[2]; - }; + struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -void io_uring_cache_free(struct io_cache_entry *entry); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -- cgit v1.2.3 From da22bdf38be2f2ba557d3031108614ebbba265e1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Mar 2024 07:53:07 -0600 Subject: io_uring/poll: shrink alloc cache size to 32 This should be plenty, rather than the default of 128, and matches what we have on the rsrc and futex side as well. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/poll.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1d453eb8e49f..8e53b93eeb18 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -308,7 +308,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_HLIST_HEAD(&ctx->io_buf_list); ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); - ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, sizeof(struct async_poll)); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_async_msghdr)); diff --git a/io_uring/poll.h b/io_uring/poll.h index 5c240f11069a..b0e3745f5a29 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#define IO_POLL_ALLOC_CACHE_MAX 32 + enum { IO_APOLL_OK, IO_APOLL_ABORTED, -- cgit v1.2.3 From 05eb5fe226461c6459b81f109a9c23b46ed8bc3b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Mar 2024 19:07:22 -0600 Subject: io_uring: refill request cache in memory order The allocator will generally return memory in order, but __io_alloc_req_refill() then adds them to a stack and we'll extract them in the opposite order. This obviously isn't a huge deal, but: 1) it makes debugging easier when they are in order 2) keeping them in-order is the right thing to do 3) reduces the code for adding them to the stack Just add them in reverse to the stack. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8e53b93eeb18..8489f1820ad9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1039,7 +1039,7 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) { gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; void *reqs[IO_REQ_ALLOC_BATCH]; - int ret, i; + int ret; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1066,8 +1066,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); - for (i = 0; i < ret; i++) { - struct io_kiocb *req = reqs[i]; + while (ret--) { + struct io_kiocb *req = reqs[ret]; io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); -- cgit v1.2.3 From 77a1cd5e795726235469c31cf13a25116bf2d712 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 26 Mar 2024 22:13:41 -0600 Subject: io_uring: re-arrange Makefile order The object list is a bit of a mess, with core and opcode files mixed in. Re-arrange it so that we have the core bits first, and then opcode specific files after that. 
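To illustrate the ordering point made in the "io_uring: refill request cache in memory order" patch above (simplified, hypothetical types rather than the real io_kiocb batch): pushing a freshly allocated batch onto a LIFO free list in reverse index order means later pops hand the entries back in the allocator's original, ascending order.

#include <stdio.h>

#define BATCH	8

struct stack {
	void *slots[BATCH];
	int top;
};

static void push(struct stack *s, void *p)
{
	s->slots[s->top++] = p;
}

static void *pop(struct stack *s)
{
	return s->top ? s->slots[--s->top] : NULL;
}

int main(void)
{
	static int objs[BATCH];	/* stands in for one bulk allocation: ascending addresses */
	struct stack cache = { .top = 0 };
	int i;

	/* add the batch in reverse, as the patch now does with reqs[] */
	for (i = BATCH - 1; i >= 0; i--)
		push(&cache, &objs[i]);

	/* consumers then pop &objs[0], &objs[1], ... in memory order */
	for (i = 0; i < BATCH; i++)
		printf("%p\n", pop(&cache));
	return 0;
}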
Signed-off-by: Jens Axboe --- io_uring/Makefile | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/io_uring/Makefile b/io_uring/Makefile index 2e1d4e03799c..bd7c692a6a7c 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -2,13 +2,13 @@ # # Makefile for io_uring -obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \ - sync.o advise.o filetable.o \ - openclose.o uring_cmd.o epoll.o \ - statx.o net.o msg_ring.o timeout.o \ - sqpoll.o fdinfo.o tctx.o poll.o \ - cancel.o kbuf.o rsrc.o rw.o opdef.o \ - notif.o waitid.o register.o truncate.o +obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ + tctx.o filetable.o rw.o net.o poll.o \ + uring_cmd.o openclose.o sqpoll.o \ + xattr.o nop.o fs.o splice.o sync.o \ + msg_ring.o advise.o openclose.o \ + epoll.o statx.o timeout.o fdinfo.o \ + cancel.o waitid.o register.o truncate.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o -- cgit v1.2.3 From 4e9706c6c8d1b159e2d04bee60189a1361f5afa9 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Thu, 28 Mar 2024 10:23:24 +0800 Subject: io_uring: Remove unused function The function are defined in the io_uring.c file, but not called elsewhere, so delete the unused function. io_uring/io_uring.c:646:20: warning: unused function '__io_cq_unlock'. Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=8660 Signed-off-by: Jiapeng Chong Link: https://lore.kernel.org/r/20240328022324.78029-1-jiapeng.chong@linux.alibaba.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8489f1820ad9..0bee0ee5c7b2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -643,12 +643,6 @@ static inline void __io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } -static inline void __io_cq_unlock(struct io_ring_ctx *ctx) -{ - if (!ctx->lockless_cq) - spin_unlock(&ctx->completion_lock); -} - static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { -- cgit v1.2.3 From a80929d1cd530be36d482218d4e9ec09ecd5204f Mon Sep 17 00:00:00 2001 From: Joel Granados Date: Thu, 28 Mar 2024 16:57:53 +0100 Subject: io_uring: Remove the now superfluous sentinel elements from ctl_table array This commit comes at the tail end of a greater effort to remove the empty elements at the end of the ctl_table arrays (sentinels) which will reduce the overall build time size of the kernel and run time memory bloat by ~64 bytes per sentinel (further information Link : https://lore.kernel.org/all/ZO5Yx5JFogGi%2FcBo@bombadil.infradead.org/) Remove sentinel element from kernel_io_uring_disabled_table Signed-off-by: Joel Granados Link: https://lore.kernel.org/r/20240328-jag-sysctl_remset_misc-v1-6-47c1463b3af2@samsung.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0bee0ee5c7b2..2d091f72ba6c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -171,7 +171,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - {}, }; #endif -- cgit v1.2.3 From 22537c9f79417fed70b352d54d01d2586fee9521 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 25 Mar 2024 18:53:33 -0600 Subject: io_uring: use the right type for work_llist empty check io_task_work_pending() uses wq_list_empty() on ctx->work_llist, but it's not an 
io_wq_work_list, it's a struct llist_head. They both have ->first as head-of-list, and it turns out the checks are identical. But be proper and use the right helper. Fixes: dac6a0eae793 ("io_uring: ensure iopoll runs local task work as well") Signed-off-by: Jens Axboe --- io_uring/io_uring.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 7654dfb34c2e..dbd9a2b870eb 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -340,7 +340,7 @@ static inline int io_run_task_work(void) static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); + return task_work_pending(current) || !llist_empty(&ctx->work_llist); } static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) -- cgit v1.2.3 From 0f21a9574b1d04afbf818a3e6a60cb95eb04a616 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Thu, 28 Mar 2024 17:09:35 -0400 Subject: io_uring: Avoid anonymous enums in io_uring uapi While valid C, anonymous enums confuse Cython (Python to C translator), as reported by Ritesh (YoSTEALTH) [1] . Since people rely on it when building against liburing and we want to keep this header in sync with the library version, let's name the existing enums in the uapi header. [1] https://github.com/cython/cython/issues/3240 Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240328210935.25640-1-krisman@suse.de Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7bd10201a02b..a7f847543a7f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -115,7 +115,7 @@ struct io_uring_sqe { */ #define IORING_FILE_INDEX_ALLOC (~0U) -enum { +enum io_uring_sqe_flags_bit { IOSQE_FIXED_FILE_BIT, IOSQE_IO_DRAIN_BIT, IOSQE_IO_LINK_BIT, @@ -374,7 +374,7 @@ enum io_uring_op { /* * IORING_OP_MSG_RING command types, stored in sqe->addr */ -enum { +enum io_uring_msg_ring_flags { IORING_MSG_DATA, /* pass sqe->len as 'res' and off as user_data */ IORING_MSG_SEND_FD, /* send a registered fd to another ring */ }; @@ -425,9 +425,7 @@ struct io_uring_cqe { #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) #define IORING_CQE_F_NOTIF (1U << 3) -enum { - IORING_CQE_BUFFER_SHIFT = 16, -}; +#define IORING_CQE_BUFFER_SHIFT 16 /* * Magic offsets for the application to mmap the data it needs @@ -526,7 +524,7 @@ struct io_uring_params { /* * io_uring_register(2) opcodes and arguments */ -enum { +enum io_uring_register_op { IORING_REGISTER_BUFFERS = 0, IORING_UNREGISTER_BUFFERS = 1, IORING_REGISTER_FILES = 2, @@ -583,7 +581,7 @@ enum { }; /* io-wq worker categories */ -enum { +enum io_wq_type { IO_WQ_BOUND, IO_WQ_UNBOUND, }; @@ -688,7 +686,7 @@ struct io_uring_buf_ring { * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT) * to get a virtual mapping for the ring. 
*/ -enum { +enum io_uring_register_pbuf_ring_flags { IOU_PBUF_RING_MMAP = 1, }; @@ -719,7 +717,7 @@ struct io_uring_napi { /* * io_uring_restriction->opcode values */ -enum { +enum io_uring_register_restriction_op { /* Allow an io_uring_register(2) opcode */ IORING_RESTRICTION_REGISTER_OP = 0, @@ -775,7 +773,7 @@ struct io_uring_recvmsg_out { /* * Argument for IORING_OP_URING_CMD when file is a socket */ -enum { +enum io_uring_socket_op { SOCKET_URING_OP_SIOCINQ = 0, SOCKET_URING_OP_SIOCOUTQ, SOCKET_URING_OP_GETSOCKOPT, -- cgit v1.2.3 From 62346c6cb28b043f2a6e95337d9081ec0b37b5f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 16 Mar 2024 07:21:43 -0600 Subject: mm: add nommu variant of vm_insert_pages() An identical one exists for vm_insert_page(), add one for vm_insert_pages() to avoid needing to check for CONFIG_MMU in code using it. Acked-by: Johannes Weiner Signed-off-by: Jens Axboe --- mm/nommu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/nommu.c b/mm/nommu.c index 5ec8f44e7ce9..a34a0e376611 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -355,6 +355,13 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_insert_pages); + int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { -- cgit v1.2.3 From 3ab1db3c6039e02a9deb9d5091d28d559917a645 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2024 09:56:14 -0600 Subject: io_uring: get rid of remap_pfn_range() for mapping rings/sqes Rather than use remap_pfn_range() for this and manually free later, switch to using vm_insert_pages() and have it Just Work. If possible, allocate a single compound page that covers the range that is needed. If that works, then we can just use page_address() on that page. If we fail to get a compound page, allocate single pages and use vmap() to map them into the kernel virtual address space. This just covers the rings/sqes, the other remaining user of the mmap remap_pfn_range() user will be converted separately. Once that is done, we can kill the old alloc/free code. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++--- io_uring/io_uring.h | 2 + 2 files changed, 133 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2d091f72ba6c..fba68c37a77d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2599,6 +2599,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } +static void io_pages_unmap(void *ptr, struct page ***pages, + unsigned short *npages) +{ + bool do_vunmap = false; + + if (!ptr) + return; + + if (*npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. 
+ */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + void io_mem_free(void *ptr) { if (!ptr) @@ -2699,8 +2729,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; @@ -2712,6 +2742,80 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); +} + +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) +{ + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } + + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +static void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } + + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; +} + void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; @@ -3298,14 +3402,12 @@ static void *io_uring_validate_mmap_request(struct file *file, /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); - ptr = ctx->rings; - break; + return ctx->rings; case IORING_OFF_SQES: /* Don't allow mmap if the ring was setup without it */ if (ctx->flags & IORING_SETUP_NO_MMAP) return ERR_PTR(-EINVAL); - ptr = ctx->sq_sqes; - break; + return ctx->sq_sqes; case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; @@ -3329,11 +3431,22 @@ static void *io_uring_validate_mmap_request(struct file *file, return ptr; } +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + #ifdef CONFIG_MMU static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { + struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << 
PAGE_SHIFT; unsigned long pfn; void *ptr; @@ -3341,6 +3454,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) if (IS_ERR(ptr)) return PTR_ERR(ptr); + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, + ctx->n_ring_pages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + } + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); } @@ -3630,7 +3753,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_mem_alloc(size); + rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); else rings = io_rings_map(ctx, p->cq_off.user_addr, size); @@ -3655,7 +3778,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, } if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_mem_alloc(size); + ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); else ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dbd9a2b870eb..75230d914007 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -70,6 +70,8 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, -- cgit v1.2.3 From 09fc75e0c035a2cabb8caa15cec6e85159dd94f0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2024 14:10:40 -0600 Subject: io_uring: use vmap() for ring mapping This is the last holdout which does odd page checking, convert it to vmap just like what is done for the non-mmap path. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 38 +++++++++----------------------------- 1 file changed, 9 insertions(+), 29 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fba68c37a77d..ef8f1c6ee253 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -63,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -2657,7 +2656,7 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, struct page **page_array; unsigned int nr_pages; void *page_addr; - int ret, i, pinned; + int ret, pinned; *npages = 0; @@ -2679,34 +2678,13 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, goto free_pages; } - page_addr = page_address(page_array[0]); - for (i = 0; i < nr_pages; i++) { - ret = -EINVAL; - - /* - * Can't support mapping user allocated ring memory on 32-bit - * archs where it could potentially reside in highmem. Just - * fail those with -EINVAL, just like we did on kernels that - * didn't support this feature. - */ - if (PageHighMem(page_array[i])) - goto free_pages; - - /* - * No support for discontig pages for now, should either be a - * single normal page, or a huge page. Later on we can add - * support for remapping discontig pages, for now we will - * just fail them with EINVAL. 
- */ - if (page_address(page_array[i]) != page_addr) - goto free_pages; - page_addr += PAGE_SIZE; + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); + if (page_addr) { + *pages = page_array; + *npages = nr_pages; + return page_addr; } - - *pages = page_array; - *npages = nr_pages; - return page_to_virt(page_array[0]); - + ret = -ENOMEM; free_pages: io_pages_free(&page_array, pinned > 0 ? pinned : 0); return ERR_PTR(ret); @@ -2736,6 +2714,8 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->n_ring_pages = 0; io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); ctx->n_sqe_pages = 0; + vunmap(ctx->rings); + vunmap(ctx->sq_sqes); } ctx->rings = NULL; -- cgit v1.2.3 From 1943f96b3816e0f0d3d6686374d6e1d617c8b42c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2024 14:58:14 -0600 Subject: io_uring: unify io_pin_pages() Move it into io_uring.c where it belongs, and use it in there as well rather than have two implementations of this. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 61 ++++++++++++++++++++++++++++++++++++----------------- io_uring/rsrc.c | 36 ------------------------------- 2 files changed, 42 insertions(+), 55 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ef8f1c6ee253..0ef418faac33 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2650,33 +2650,57 @@ static void io_pages_free(struct page ***pages, int npages) *pages = NULL; } +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, unsigned long uaddr, size_t size) { struct page **page_array; unsigned int nr_pages; void *page_addr; - int ret, pinned; *npages = 0; if (uaddr & (PAGE_SIZE - 1) || !size) return ERR_PTR(-EINVAL); - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > USHRT_MAX) - return ERR_PTR(-EINVAL); - page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!page_array) - return ERR_PTR(-ENOMEM); - - - pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - page_array); - if (pinned != nr_pages) { - ret = (pinned < 0) ? pinned : -EFAULT; - goto free_pages; - } + nr_pages = 0; + page_array = io_pin_pages(uaddr, size, &nr_pages); + if (IS_ERR(page_array)) + return page_array; page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); if (page_addr) { @@ -2684,10 +2708,9 @@ static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, *npages = nr_pages; return page_addr; } - ret = -ENOMEM; -free_pages: - io_pages_free(&page_array, pinned > 0 ? 
pinned : 0); - return ERR_PTR(ret); + + io_pages_free(&page_array, nr_pages); + return ERR_PTR(-ENOMEM); } static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 7b8a056f98ed..8a34181c97ab 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -870,42 +870,6 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct page **pages = NULL; - int ret; - - end = (ubuf + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = ubuf >> PAGE_SHIFT; - nr_pages = end - start; - WARN_ON(!nr_pages); - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - mmap_read_lock(current->mm); - ret = pin_user_pages(ubuf, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); - mmap_read_unlock(current->mm); - - /* success, mapped all pages */ - if (ret == nr_pages) { - *npages = nr_pages; - return pages; - } - - /* partial map, or didn't map anything */ - if (ret >= 0) { - /* if we did partial map, release any pages we did get */ - if (ret) - unpin_user_pages(pages, ret); - ret = -EFAULT; - } - kvfree(pages); - return ERR_PTR(ret); -} - static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage) -- cgit v1.2.3 From e270bfd22a2a10d1cfbaddf23e79b6d0b405d21e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 10:42:27 -0600 Subject: io_uring/kbuf: vmap pinned buffer ring This avoids needing to care about HIGHMEM, and it makes the buffer indexing easier as both ring provided buffer methods are now virtually mapped in a contigious fashion. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3aa16e27f509..4289f4a92693 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -146,15 +147,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BL_EMPTY; head &= bl->mask; - /* mmaped buffers are always contig */ - if (bl->is_mmap || head < IO_BUFFER_LIST_BUF_PER_PAGE) { - buf = &br->bufs[head]; - } else { - int off = head & (IO_BUFFER_LIST_BUF_PER_PAGE - 1); - int index = head / IO_BUFFER_LIST_BUF_PER_PAGE; - buf = page_address(bl->buf_pages[index]); - buf += off; - } + buf = &br->bufs[head]; if (*len == 0 || *len > buf->len) *len = buf->len; req->flags |= REQ_F_BUFFER_RING; @@ -241,6 +234,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, for (j = 0; j < bl->buf_nr_pages; j++) unpin_user_page(bl->buf_pages[j]); kvfree(bl->buf_pages); + vunmap(bl->buf_ring); bl->buf_pages = NULL; bl->buf_nr_pages = 0; } @@ -498,9 +492,9 @@ err: static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_uring_buf_ring *br; + struct io_uring_buf_ring *br = NULL; + int nr_pages, ret, i; struct page **pages; - int i, nr_pages; pages = io_pin_pages(reg->ring_addr, flex_array_size(br, bufs, reg->ring_entries), @@ -508,18 +502,12 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, if (IS_ERR(pages)) return PTR_ERR(pages); - /* - * Apparently some 32-bit boxes (ARM) will return highmem pages, - * which then need to be mapped. 
We could support that, but it'd - * complicate the code and slowdown the common cases quite a bit. - * So just error out, returning -EINVAL just like we did on kernels - * that didn't support mapped buffer rings. - */ - for (i = 0; i < nr_pages; i++) - if (PageHighMem(pages[i])) - goto error_unpin; + br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!br) { + ret = -ENOMEM; + goto error_unpin; + } - br = page_address(pages[0]); #ifdef SHM_COLOUR /* * On platforms that have specific aliasing requirements, SHM_COLOUR @@ -530,8 +518,10 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, * should use IOU_PBUF_RING_MMAP instead, and liburing will handle * this transparently. */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) + if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { + ret = -EINVAL; goto error_unpin; + } #endif bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; @@ -543,7 +533,8 @@ error_unpin: for (i = 0; i < nr_pages; i++) unpin_user_page(pages[i]); kvfree(pages); - return -EINVAL; + vunmap(br); + return ret; } /* -- cgit v1.2.3 From 87585b05757dc70545efb434669708d276125559 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 Mar 2024 20:24:21 -0600 Subject: io_uring/kbuf: use vm_insert_pages() for mmap'ed pbuf ring Rather than use remap_pfn_range() for this and manually free later, switch to using vm_insert_page() and have it Just Work. This requires a bit of effort on the mmap lookup side, as the ctx uring_lock isn't held, which otherwise protects buffer_lists from being torn down, and it's not safe to grab from mmap context that would introduce an ABBA deadlock between the mmap lock and the ctx uring_lock. Instead, lookup the buffer_list under RCU, as the the list is RCU freed already. Use the existing reference count to determine whether it's possible to safely grab a reference to it (eg if it's not zero already), and drop that reference when done with the mapping. If the mmap reference is the last one, the buffer_list and the associated memory can go away, since the vma insertion has references to the inserted pages at that point. 
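The lookup this message describes follows a common kernel pattern, sketched below with hypothetical names (struct my_obj, my_obj_get()/my_obj_put()) rather than the actual io_buffer_list helpers: find the object in an xarray under rcu_read_lock(), and only use it if its existing reference count can be bumped from a non-zero value, so an object already on its way to being freed is simply skipped.

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/xarray.h>

struct my_obj {
	refcount_t refs;
	struct rcu_head rcu;
	/* payload */
};

static struct my_obj *my_obj_get(struct xarray *xa, unsigned long id)
{
	struct my_obj *obj;

	rcu_read_lock();
	obj = xa_load(xa, id);
	/* RCU keeps the memory valid here; the refcount says whether it's still live */
	if (obj && !refcount_inc_not_zero(&obj->refs))
		obj = NULL;
	rcu_read_unlock();
	return obj;
}

static void my_obj_put(struct my_obj *obj)
{
	if (refcount_dec_and_test(&obj->refs))
		kfree_rcu(obj, rcu);
}

In the diff below, io_pbuf_get_bl() and io_put_bl() play those two roles when io_pbuf_mmap() resolves a buffer group id.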
Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 - io_uring/io_uring.c | 58 +++++------------- io_uring/io_uring.h | 6 +- io_uring/kbuf.c | 134 ++++++++--------------------------------- io_uring/kbuf.h | 3 +- 5 files changed, 47 insertions(+), 157 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ef45b8bd1b35..d34c8433caf9 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -372,9 +372,6 @@ struct io_ring_ctx { struct list_head io_buffers_cache; - /* deferred free list, protected by ->uring_lock */ - struct hlist_head io_buf_list; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0ef418faac33..50e859da59e1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -303,7 +303,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - INIT_HLIST_HEAD(&ctx->io_buf_list); ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, sizeof(struct io_rsrc_node)); ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, @@ -2598,15 +2597,15 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -static void io_pages_unmap(void *ptr, struct page ***pages, - unsigned short *npages) +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages) { bool do_vunmap = false; if (!ptr) return; - if (*npages) { + if (put_pages && *npages) { struct page **to_free = *pages; int i; @@ -2628,14 +2627,6 @@ static void io_pages_unmap(void *ptr, struct page ***pages, *npages = 0; } -void io_mem_free(void *ptr) -{ - if (!ptr) - return; - - folio_put(virt_to_folio(ptr)); -} - static void io_pages_free(struct page ***pages, int npages) { struct page **page_array = *pages; @@ -2730,8 +2721,10 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages); + io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, + true); + io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, + true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; @@ -2788,8 +2781,8 @@ err: return ERR_PTR(-ENOMEM); } -static void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size) +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; struct page **pages; @@ -2819,17 +2812,6 @@ done: return ret; } -void *io_mem_alloc(size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; - void *ret; - - ret = (void *) __get_free_pages(gfp, get_order(size)); - if (ret) - return ret; - return ERR_PTR(-ENOMEM); -} - static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { @@ -2926,7 +2908,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } io_rings_free(ctx); - io_kbuf_mmap_list_free(ctx); 
percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3396,10 +3377,8 @@ static void *io_uring_validate_mmap_request(struct file *file, { struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; - struct page *page; - void *ptr; - switch (offset & IORING_OFF_MMAP_MASK) { + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: /* Don't allow mmap if the ring was setup without it */ @@ -3414,6 +3393,7 @@ static void *io_uring_validate_mmap_request(struct file *file, case IORING_OFF_PBUF_RING: { struct io_buffer_list *bl; unsigned int bgid; + void *ptr; bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); @@ -3421,17 +3401,11 @@ static void *io_uring_validate_mmap_request(struct file *file, return bl; ptr = bl->buf_ring; io_put_bl(ctx, bl); - break; + return ptr; } - default: - return ERR_PTR(-EINVAL); } - page = virt_to_head_page(ptr); - if (sz > page_size(page)) - return ERR_PTR(-EINVAL); - - return ptr; + return ERR_PTR(-EINVAL); } int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, @@ -3450,7 +3424,6 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned long pfn; void *ptr; ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); @@ -3465,10 +3438,11 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) case IORING_OFF_SQES: return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, ctx->n_sqe_pages); + case IORING_OFF_PBUF_RING: + return io_pbuf_mmap(file, vma); } - pfn = virt_to_phys(ptr) >> PAGE_SHIFT; - return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); + return -EINVAL; } static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 75230d914007..dec996a1c789 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -109,8 +109,10 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -void *io_mem_alloc(size_t size); -void io_mem_free(void *ptr); +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size); +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages); enum { IO_EVENTFD_OP_SIGNAL_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 4289f4a92693..820ac599d003 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -32,25 +32,12 @@ struct io_provide_buf { __u16 bid; }; -struct io_buf_free { - struct hlist_node list; - void *mem; - size_t size; - int inuse; -}; - -static inline struct io_buffer_list *__io_buffer_get_list(struct io_ring_ctx *ctx, - unsigned int bgid) -{ - return xa_load(&ctx->io_bl_xa, bgid); -} - static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { lockdep_assert_held(&ctx->uring_lock); - return __io_buffer_get_list(ctx, bgid); + return xa_load(&ctx->io_bl_xa, bgid); } static int io_buffer_add_list(struct io_ring_ctx *ctx, @@ -191,24 +178,6 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } -/* - * Mark the given mapped range as free for reuse - */ -static void io_kbuf_mark_free(struct io_ring_ctx *ctx, struct io_buffer_list *bl) -{ - struct io_buf_free *ibf; - - hlist_for_each_entry(ibf, 
&ctx->io_buf_list, list) { - if (bl->buf_ring == ibf->mem) { - ibf->inuse = 0; - return; - } - } - - /* can't happen... */ - WARN_ON_ONCE(1); -} - static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { @@ -220,23 +189,16 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_buf_ring) { i = bl->buf_ring->tail - bl->head; - if (bl->is_mmap) { - /* - * io_kbuf_list_free() will free the page(s) at - * ->release() time. - */ - io_kbuf_mark_free(ctx, bl); - bl->buf_ring = NULL; - bl->is_mmap = 0; - } else if (bl->buf_nr_pages) { + if (bl->buf_nr_pages) { int j; - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - kvfree(bl->buf_pages); - vunmap(bl->buf_ring); - bl->buf_pages = NULL; - bl->buf_nr_pages = 0; + if (!bl->is_mmap) { + for (j = 0; j < bl->buf_nr_pages; j++) + unpin_user_page(bl->buf_pages[j]); + } + io_pages_unmap(bl->buf_ring, &bl->buf_pages, + &bl->buf_nr_pages, bl->is_mmap); + bl->is_mmap = 0; } /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); @@ -537,63 +499,18 @@ error_unpin: return ret; } -/* - * See if we have a suitable region that we can reuse, rather than allocate - * both a new io_buf_free and mem region again. We leave it on the list as - * even a reused entry will need freeing at ring release. - */ -static struct io_buf_free *io_lookup_buf_free_entry(struct io_ring_ctx *ctx, - size_t ring_size) -{ - struct io_buf_free *ibf, *best = NULL; - size_t best_dist; - - hlist_for_each_entry(ibf, &ctx->io_buf_list, list) { - size_t dist; - - if (ibf->inuse || ibf->size < ring_size) - continue; - dist = ibf->size - ring_size; - if (!best || dist < best_dist) { - best = ibf; - if (!dist) - break; - best_dist = dist; - } - } - - return best; -} - static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { - struct io_buf_free *ibf; size_t ring_size; - void *ptr; ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - /* Reuse existing entry, if we can */ - ibf = io_lookup_buf_free_entry(ctx, ring_size); - if (!ibf) { - ptr = io_mem_alloc(ring_size); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - /* Allocate and store deferred free entry */ - ibf = kmalloc(sizeof(*ibf), GFP_KERNEL_ACCOUNT); - if (!ibf) { - io_mem_free(ptr); - return -ENOMEM; - } - ibf->mem = ptr; - ibf->size = ring_size; - hlist_add_head(&ibf->list, &ctx->io_buf_list); - } - ibf->inuse = 1; - bl->buf_ring = ibf->mem; + bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); + if (!bl->buf_ring) + return -ENOMEM; + bl->is_buf_ring = 1; bl->is_mmap = 1; return 0; @@ -741,18 +658,19 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, return ERR_PTR(-EINVAL); } -/* - * Called at or after ->release(), free the mmap'ed buffers that we used - * for memory mapped provided buffer rings. 
- */ -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx) +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) { - struct io_buf_free *ibf; - struct hlist_node *tmp; + struct io_ring_ctx *ctx = file->private_data; + loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; + struct io_buffer_list *bl; + int bgid, ret; - hlist_for_each_entry_safe(ibf, tmp, &ctx->io_buf_list, list) { - hlist_del(&ibf->list); - io_mem_free(ibf->mem); - kfree(ibf); - } + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return PTR_ERR(bl); + + ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); + io_put_bl(ctx, bl); + return ret; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index df365b8860cf..53c141d9a8b2 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,8 +55,6 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); - void __io_put_kbuf(struct io_kiocb *req, unsigned issue_flags); bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); @@ -64,6 +62,7 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid); +int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { -- cgit v1.2.3 From 18595c0a58ae29ac6a996c5b664610119b73182d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 Mar 2024 15:01:03 -0600 Subject: io_uring: use unpin_user_pages() where appropriate There are a few cases of open-rolled loops around unpin_user_page(), use the generic helper instead. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +--- io_uring/kbuf.c | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 50e859da59e1..37066ea98f0b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2630,13 +2630,11 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, static void io_pages_free(struct page ***pages, int npages) { struct page **page_array = *pages; - int i; if (!page_array) return; - for (i = 0; i < npages; i++) - unpin_user_page(page_array[i]); + unpin_user_pages(page_array, npages); kvfree(page_array); *pages = NULL; } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 820ac599d003..10d5c98c1c56 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -455,8 +455,8 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, struct io_buffer_list *bl) { struct io_uring_buf_ring *br = NULL; - int nr_pages, ret, i; struct page **pages; + int nr_pages, ret; pages = io_pin_pages(reg->ring_addr, flex_array_size(br, bufs, reg->ring_entries), @@ -492,8 +492,7 @@ static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, bl->is_mmap = 0; return 0; error_unpin: - for (i = 0; i < nr_pages; i++) - unpin_user_page(pages[i]); + unpin_user_pages(pages, nr_pages); kvfree(pages); vunmap(br); return ret; -- cgit v1.2.3 From f15ed8b4d0ce2c0831232ff85117418740f0c529 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Mar 2024 14:59:09 -0600 Subject: io_uring: move mapping/allocation helpers to a separate file Move the related code from io_uring.c into memmap.c. 
No functional changes in this patch, just cleaning it up a bit now that the full transition is done. Signed-off-by: Jens Axboe --- io_uring/Makefile | 3 +- io_uring/io_uring.c | 327 +------------------------------------------------- io_uring/io_uring.h | 9 -- io_uring/kbuf.c | 1 + io_uring/memmap.c | 336 ++++++++++++++++++++++++++++++++++++++++++++++++++++ io_uring/memmap.h | 25 ++++ io_uring/rsrc.c | 1 + 7 files changed, 367 insertions(+), 335 deletions(-) create mode 100644 io_uring/memmap.c create mode 100644 io_uring/memmap.h diff --git a/io_uring/Makefile b/io_uring/Makefile index bd7c692a6a7c..fc1b23c524e8 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -8,7 +8,8 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ xattr.o nop.o fs.o splice.o sync.o \ msg_ring.o advise.o openclose.o \ epoll.o statx.o timeout.o fdinfo.o \ - cancel.o waitid.o register.o truncate.o + cancel.o waitid.o register.o \ + truncate.o memmap.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 37066ea98f0b..cc5fa4e1b344 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "futex.h" #include "napi.h" #include "uring_cmd.h" +#include "memmap.h" #include "timeout.h" #include "poll.h" @@ -2597,111 +2598,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0; } -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages) -{ - bool do_vunmap = false; - - if (!ptr) - return; - - if (put_pages && *npages) { - struct page **to_free = *pages; - int i; - - /* - * Only did vmap for the non-compound multiple page case. - * For the compound page, we just need to put the head. 
- */ - if (PageCompound(to_free[0])) - *npages = 1; - else if (*npages > 1) - do_vunmap = true; - for (i = 0; i < *npages; i++) - put_page(to_free[i]); - } - if (do_vunmap) - vunmap(ptr); - kvfree(*pages); - *pages = NULL; - *npages = 0; -} - -static void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - - if (!page_array) - return; - - unpin_user_pages(page_array, npages); - kvfree(page_array); - *pages = NULL; -} - -struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) -{ - unsigned long start, end, nr_pages; - struct page **pages; - int ret; - - end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - start = uaddr >> PAGE_SHIFT; - nr_pages = end - start; - if (WARN_ON_ONCE(!nr_pages)) - return ERR_PTR(-EINVAL); - - pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, - pages); - /* success, mapped all pages */ - if (ret == nr_pages) { - *npages = nr_pages; - return pages; - } - - /* partial map, or didn't map anything */ - if (ret >= 0) { - /* if we did partial map, release any pages we did get */ - if (ret) - unpin_user_pages(pages, ret); - ret = -EFAULT; - } - kvfree(pages); - return ERR_PTR(ret); -} - -static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = 0; - page_array = io_pin_pages(uaddr, size, &nr_pages); - if (IS_ERR(page_array)) - return page_array; - - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); - if (page_addr) { - *pages = page_array; - *npages = nr_pages; - return page_addr; - } - - io_pages_free(&page_array, nr_pages); - return ERR_PTR(-ENOMEM); -} - static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size_t size) { @@ -2736,80 +2632,6 @@ static void io_rings_free(struct io_ring_ctx *ctx) ctx->sq_sqes = NULL; } -static void *io_mem_alloc_compound(struct page **pages, int nr_pages, - size_t size, gfp_t gfp) -{ - struct page *page; - int i, order; - - order = get_order(size); - if (order > MAX_PAGE_ORDER) - return ERR_PTR(-ENOMEM); - else if (order) - gfp |= __GFP_COMP; - - page = alloc_pages(gfp, order); - if (!page) - return ERR_PTR(-ENOMEM); - - for (i = 0; i < nr_pages; i++) - pages[i] = page + i; - - return page_address(page); -} - -static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, - gfp_t gfp) -{ - void *ret; - int i; - - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(gfp); - if (!pages[i]) - goto err; - } - - ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (ret) - return ret; -err: - while (i--) - put_page(pages[i]); - return ERR_PTR(-ENOMEM); -} - -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - struct page **pages; - int nr_pages; - void *ret; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); - if (!pages) - return ERR_PTR(-ENOMEM); - - ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) - goto done; - - ret = io_mem_alloc_single(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) { -done: - *out_pages = pages; - *npages = nr_pages; - return ret; - } - - kvfree(pages); - 
*out_pages = NULL; - *npages = 0; - return ret; -} - static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, unsigned int cq_entries, size_t *sq_offset) { @@ -3370,149 +3192,6 @@ void __io_uring_cancel(bool cancel_all) io_uring_cancel_generic(cancel_all, NULL); } -static void *io_uring_validate_mmap_request(struct file *file, - loff_t pgoff, size_t sz) -{ - struct io_ring_ctx *ctx = file->private_data; - loff_t offset = pgoff << PAGE_SHIFT; - - switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - return ctx->rings; - case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - return ctx->sq_sqes; - case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; - unsigned int bgid; - void *ptr; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - io_put_bl(ctx, bl); - return ptr; - } - } - - return ERR_PTR(-EINVAL); -} - -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages) -{ - unsigned long nr_pages = npages; - - vm_flags_set(vma, VM_DONTEXPAND); - return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); -} - -#ifdef CONFIG_MMU - -static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct io_ring_ctx *ctx = file->private_data; - size_t sz = vma->vm_end - vma->vm_start; - long offset = vma->vm_pgoff << PAGE_SHIFT; - void *ptr; - - ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - switch (offset & IORING_OFF_MMAP_MASK) { - case IORING_OFF_SQ_RING: - case IORING_OFF_CQ_RING: - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, - ctx->n_ring_pages); - case IORING_OFF_SQES: - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, - ctx->n_sqe_pages); - case IORING_OFF_PBUF_RING: - return io_pbuf_mmap(file, vma); - } - - return -EINVAL; -} - -static unsigned long io_uring_mmu_get_unmapped_area(struct file *filp, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - /* - * Do not allow to map to user-provided address to avoid breaking the - * aliasing rules. Userspace is not able to guess the offset address of - * kernel kmalloc()ed memory area. - */ - if (addr) - return -EINVAL; - - ptr = io_uring_validate_mmap_request(filp, pgoff, len); - if (IS_ERR(ptr)) - return -ENOMEM; - - /* - * Some architectures have strong cache aliasing requirements. - * For such architectures we need a coherent mapping which aliases - * kernel memory *and* userspace memory. To achieve that: - * - use a NULL file pointer to reference physical memory, and - * - use the kernel virtual address of the shared io_uring context - * (instead of the userspace-provided address, which has to be 0UL - * anyway). - * - use the same pgoff which the get_unmapped_area() uses to - * calculate the page colouring. - * For architectures without such aliasing requirements, the - * architecture will return any suitable mapping because addr is 0. 
- */ - filp = NULL; - flags |= MAP_SHARED; - pgoff = 0; /* has been translated to ptr above */ -#ifdef SHM_COLOUR - addr = (uintptr_t) ptr; - pgoff = addr >> PAGE_SHIFT; -#else - addr = 0UL; -#endif - return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); -} - -#else /* !CONFIG_MMU */ - -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) -{ - return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; -} - -static unsigned int io_uring_nommu_mmap_capabilities(struct file *file) -{ - return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; -} - -static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - void *ptr; - - ptr = io_uring_validate_mmap_request(file, pgoff, len); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - - return (unsigned long) ptr; -} - -#endif /* !CONFIG_MMU */ - static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) { if (flags & IORING_ENTER_EXT_ARG) { @@ -3695,11 +3374,9 @@ out: static const struct file_operations io_uring_fops = { .release = io_uring_release, .mmap = io_uring_mmap, + .get_unmapped_area = io_uring_get_unmapped_area, #ifndef CONFIG_MMU - .get_unmapped_area = io_uring_nommu_get_unmapped_area, .mmap_capabilities = io_uring_nommu_mmap_capabilities, -#else - .get_unmapped_area = io_uring_mmu_get_unmapped_area, #endif .poll = io_uring_poll, #ifdef CONFIG_PROC_FS diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index dec996a1c789..1eb65324792a 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -69,10 +69,6 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages); - struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -109,11 +105,6 @@ bool __io_alloc_req_refill(struct io_ring_ctx *ctx); bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, bool cancel_all); -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size); -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages); - enum { IO_EVENTFD_OP_SIGNAL_BIT, IO_EVENTFD_OP_FREE_BIT, diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 10d5c98c1c56..896770e93980 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -15,6 +15,7 @@ #include "io_uring.h" #include "opdef.h" #include "kbuf.h" +#include "memmap.h" #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) diff --git a/io_uring/memmap.c b/io_uring/memmap.c new file mode 100644 index 000000000000..523d982af2b0 --- /dev/null +++ b/io_uring/memmap.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "memmap.h" +#include "kbuf.h" + +static void *io_mem_alloc_compound(struct page **pages, int nr_pages, + size_t size, gfp_t gfp) +{ + struct page *page; + int i, order; + + order = get_order(size); + if (order > MAX_PAGE_ORDER) + return ERR_PTR(-ENOMEM); + else if (order) + gfp |= __GFP_COMP; + + page = alloc_pages(gfp, order); + 
if (!page) + return ERR_PTR(-ENOMEM); + + for (i = 0; i < nr_pages; i++) + pages[i] = page + i; + + return page_address(page); +} + +static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, + gfp_t gfp) +{ + void *ret; + int i; + + for (i = 0; i < nr_pages; i++) { + pages[i] = alloc_page(gfp); + if (!pages[i]) + goto err; + } + + ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (ret) + return ret; +err: + while (i--) + put_page(pages[i]); + return ERR_PTR(-ENOMEM); +} + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + struct page **pages; + int nr_pages; + void *ret; + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) + goto done; + + ret = io_mem_alloc_single(pages, nr_pages, size, gfp); + if (!IS_ERR(ret)) { +done: + *out_pages = pages; + *npages = nr_pages; + return ret; + } + + kvfree(pages); + *out_pages = NULL; + *npages = 0; + return ret; +} + +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages) +{ + bool do_vunmap = false; + + if (!ptr) + return; + + if (put_pages && *npages) { + struct page **to_free = *pages; + int i; + + /* + * Only did vmap for the non-compound multiple page case. + * For the compound page, we just need to put the head. + */ + if (PageCompound(to_free[0])) + *npages = 1; + else if (*npages > 1) + do_vunmap = true; + for (i = 0; i < *npages; i++) + put_page(to_free[i]); + } + if (do_vunmap) + vunmap(ptr); + kvfree(*pages); + *pages = NULL; + *npages = 0; +} + +void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array = *pages; + + if (!page_array) + return; + + unpin_user_pages(page_array, npages); + kvfree(page_array); + *pages = NULL; +} + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) +{ + unsigned long start, end, nr_pages; + struct page **pages; + int ret; + + end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + nr_pages = end - start; + if (WARN_ON_ONCE(!nr_pages)) + return ERR_PTR(-EINVAL); + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + pages); + /* success, mapped all pages */ + if (ret == nr_pages) { + *npages = nr_pages; + return pages; + } + + /* partial map, or didn't map anything */ + if (ret >= 0) { + /* if we did partial map, release any pages we did get */ + if (ret) + unpin_user_pages(pages, ret); + ret = -EFAULT; + } + kvfree(pages); + return ERR_PTR(ret); +} + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + void *page_addr; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = 0; + page_array = io_pin_pages(uaddr, size, &nr_pages); + if (IS_ERR(page_array)) + return page_array; + + page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); + if (page_addr) { + *pages = page_array; + *npages = nr_pages; + return page_addr; + } + + io_pages_free(&page_array, nr_pages); + return ERR_PTR(-ENOMEM); +} + +static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, + size_t sz) +{ + 
struct io_ring_ctx *ctx = file->private_data; + loff_t offset = pgoff << PAGE_SHIFT; + + switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->rings; + case IORING_OFF_SQES: + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + return ctx->sq_sqes; + case IORING_OFF_PBUF_RING: { + struct io_buffer_list *bl; + unsigned int bgid; + void *ptr; + + bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + bl = io_pbuf_get_bl(ctx, bgid); + if (IS_ERR(bl)) + return bl; + ptr = bl->buf_ring; + io_put_bl(ctx, bl); + return ptr; + } + } + + return ERR_PTR(-EINVAL); +} + +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages) +{ + unsigned long nr_pages = npages; + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); +} + +#ifdef CONFIG_MMU + +__cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct io_ring_ctx *ctx = file->private_data; + size_t sz = vma->vm_end - vma->vm_start; + long offset = vma->vm_pgoff << PAGE_SHIFT; + void *ptr; + + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + switch (offset & IORING_OFF_MMAP_MASK) { + case IORING_OFF_SQ_RING: + case IORING_OFF_CQ_RING: + return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, + ctx->n_ring_pages); + case IORING_OFF_SQES: + return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, + ctx->n_sqe_pages); + case IORING_OFF_PBUF_RING: + return io_pbuf_mmap(file, vma); + } + + return -EINVAL; +} + +unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + /* + * Do not allow to map to user-provided address to avoid breaking the + * aliasing rules. Userspace is not able to guess the offset address of + * kernel kmalloc()ed memory area. + */ + if (addr) + return -EINVAL; + + ptr = io_uring_validate_mmap_request(filp, pgoff, len); + if (IS_ERR(ptr)) + return -ENOMEM; + + /* + * Some architectures have strong cache aliasing requirements. + * For such architectures we need a coherent mapping which aliases + * kernel memory *and* userspace memory. To achieve that: + * - use a NULL file pointer to reference physical memory, and + * - use the kernel virtual address of the shared io_uring context + * (instead of the userspace-provided address, which has to be 0UL + * anyway). + * - use the same pgoff which the get_unmapped_area() uses to + * calculate the page colouring. + * For architectures without such aliasing requirements, the + * architecture will return any suitable mapping because addr is 0. + */ + filp = NULL; + flags |= MAP_SHARED; + pgoff = 0; /* has been translated to ptr above */ +#ifdef SHM_COLOUR + addr = (uintptr_t) ptr; + pgoff = addr >> PAGE_SHIFT; +#else + addr = 0UL; +#endif + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +#else /* !CONFIG_MMU */ + +int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + return is_nommu_shared_mapping(vma->vm_flags) ? 
0 : -EINVAL; +} + +unsigned int io_uring_nommu_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; +} + +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + void *ptr; + + ptr = io_uring_validate_mmap_request(file, pgoff, len); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + return (unsigned long) ptr; +} + +#endif /* !CONFIG_MMU */ diff --git a/io_uring/memmap.h b/io_uring/memmap.h new file mode 100644 index 000000000000..5cec5b7ac49a --- /dev/null +++ b/io_uring/memmap.h @@ -0,0 +1,25 @@ +#ifndef IO_URING_MEMMAP_H +#define IO_URING_MEMMAP_H + +struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +void io_pages_free(struct page ***pages, int npages); +int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, + struct page **pages, int npages); + +void *io_pages_map(struct page ***out_pages, unsigned short *npages, + size_t size); +void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, + bool put_pages); + +void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size); + +#ifndef CONFIG_MMU +unsigned int io_uring_nommu_mmap_capabilities(struct file *file); +#endif +unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); +int io_uring_mmap(struct file *file, struct vm_area_struct *vma); + +#endif diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 8a34181c97ab..65417c9553b1 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -16,6 +16,7 @@ #include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" +#include "memmap.h" struct io_rsrc_update { struct file *file; -- cgit v1.2.3 From 1da2f311ba53a1ee106a637cf17aba05d2acc1ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Mar 2024 17:19:45 -0600 Subject: io_uring: fix warnings on shadow variables There are a few of those: io_uring/fdinfo.c:170:16: warning: declaration shadows a local variable [-Wshadow] 170 | struct file *f = io_file_from_index(&ctx->file_table, i); | ^ io_uring/fdinfo.c:53:67: note: previous declaration is here 53 | __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) | ^ io_uring/cancel.c:187:25: warning: declaration shadows a local variable [-Wshadow] 187 | struct io_uring_task *tctx = node->task->io_uring; | ^ io_uring/cancel.c:166:31: note: previous declaration is here 166 | struct io_uring_task *tctx, | ^ io_uring/register.c:371:25: warning: declaration shadows a local variable [-Wshadow] 371 | struct io_uring_task *tctx = node->task->io_uring; | ^ io_uring/register.c:312:24: note: previous declaration is here 312 | struct io_uring_task *tctx = NULL; | ^ and a simple cleanup gets rid of them. For the fdinfo case, make a distinction between the file being passed in (for the ring), and the registered files we iterate. For the other two cases, just get rid of shadowed variable, there's no reason to have a new one. 
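As a minimal illustration of what -Wshadow flags (a made-up example, not code from the tree; nr_files and files[] are hypothetical), an inner declaration reuses an outer name, so later uses of that name can silently refer to the wrong object:

static void show_files_example(struct seq_file *m, struct file *f)
{
	int i;

	for (i = 0; i < nr_files; i++) {
		struct file *f = files[i];	/* -Wshadow: hides the parameter 'f' */

		if (f)
			seq_printf(m, "%3d: %p\n", i, f);
	}
	/* from here on, 'f' refers to the parameter again */
}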
Signed-off-by: Jens Axboe --- io_uring/cancel.c | 4 +--- io_uring/fdinfo.c | 4 ++-- io_uring/register.c | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/io_uring/cancel.c b/io_uring/cancel.c index acfcdd7f059a..a6e58a20efdd 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -184,9 +184,7 @@ static int __io_async_cancel(struct io_cancel_data *cd, io_ring_submit_lock(ctx, issue_flags); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - - ret = io_async_cancel_one(tctx, cd); + ret = io_async_cancel_one(node->task->io_uring, cd); if (ret != -ENOENT) { if (!all) break; diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 8d444dd1b0a7..b1e0e0d85349 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -50,9 +50,9 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, * Caller holds a reference to the file already, we don't need to do * anything else to get an extra reference. */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) { - struct io_ring_ctx *ctx = f->private_data; + struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; diff --git a/io_uring/register.c b/io_uring/register.c index 99c37775f974..ef8c908346a4 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -368,8 +368,7 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, /* now propagate the restriction to all registered users */ list_for_each_entry(node, &ctx->tctx_list, ctx_node) { - struct io_uring_task *tctx = node->task->io_uring; - + tctx = node->task->io_uring; if (WARN_ON_ONCE(!tctx->io_wq)) continue; -- cgit v1.2.3 From 285207f67c9bcad1d9168993f175d6d88ce310f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Mar 2024 17:22:27 -0600 Subject: io_uring/kbuf: remove dead define We no longer use IO_BUFFER_LIST_BUF_PER_PAGE, kill it. Signed-off-by: Jens Axboe --- io_uring/kbuf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 896770e93980..3846a055df44 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -17,8 +17,6 @@ #include "kbuf.h" #include "memmap.h" -#define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) - /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -- cgit v1.2.3 From f39130004d3a9155d113284c19b5a7c2eccb43fe Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 5 Apr 2024 16:50:02 +0100 Subject: io_uring: kill dead code in io_req_complete_post Since commit 8f6c829491fe ("io_uring: remove struct io_tw_state::locked"), io_req_complete_post() is only called from io-wq submit work, where the request reference is guaranteed to be grabbed and won't drop to zero in io_req_complete_post(). Kill the dead code, meantime add req_ref_put() to put the reference. 
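For illustration, the calling convention this relies on looks roughly like the following; a simplified sketch, not the actual io-wq worker:

static void io_wq_submit_work_sketch(struct io_kiocb *req)
{
	/* io-wq owns a request reference for the whole work item */
	WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT));

	io_issue_sqe(req, IO_URING_F_UNLOCKED | IO_URING_F_IOWQ);
	/*
	 * io_req_complete_post() may have done req_ref_put() by now, but
	 * the reference held here guarantees it could not have hit zero.
	 */
}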
Cc: Pavel Begunkov Signed-off-by: Ming Lei Reviewed-by: Pavel Begunkov Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d8297e2046553153e763a52574f0e0f4d512f86.1712331455.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 37 ++----------------------------------- io_uring/refs.h | 7 +++++++ 2 files changed, 9 insertions(+), 35 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index cc5fa4e1b344..8bd5db2056ee 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -928,7 +928,6 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_rsrc_node *rsrc_node = NULL; /* * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires @@ -945,42 +944,10 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) if (!io_fill_cqe_req(ctx, req)) io_req_cqe_overflow(req); } - - /* - * If we're the last reference to this request, add to our locked - * free_list cache. - */ - if (req_ref_put_and_test(req)) { - if (req->flags & IO_REQ_LINK_FLAGS) { - if (req->flags & IO_DISARM_MASK) - io_disarm_next(req); - if (req->link) { - io_req_task_queue(req->link); - req->link = NULL; - } - } - io_put_kbuf_comp(req); - if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - io_put_file(req); - - rsrc_node = req->rsrc_node; - /* - * Selected buffer deallocation in io_clean_op() assumes that - * we don't hold ->completion_lock. Clean them here to avoid - * deadlocks. - */ - io_put_task_remote(req->task); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - } io_cq_unlock_post(ctx); - if (rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - io_put_rsrc_node(ctx, rsrc_node); - io_ring_submit_unlock(ctx, issue_flags); - } + /* called from io-wq submit work only, the ref won't drop to zero */ + req_ref_put(req); } void io_req_defer_failed(struct io_kiocb *req, s32 res) diff --git a/io_uring/refs.h b/io_uring/refs.h index 1336de3f2a30..63982ead9f7d 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -33,6 +33,13 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_dec(&req->refs); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { -- cgit v1.2.3 From de96e9ae69a134c009a6d9a7ca182fa67067ecac Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 5 Apr 2024 16:50:03 +0100 Subject: io_uring: turn implicit assumptions into a warning io_req_complete_post() is now io-wq only and shouldn't be used outside of it, i.e. it relies that io-wq holds a ref for the request as explained in a comment below. Let's add a warning to enforce the assumption and make sure nobody would try to do anything weird. 
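A hypothetical misuse that the new check would catch (no such caller exists in the tree; shown only to make the enforced assumption concrete):

	/* some non-io-wq path trying to complete a request directly */
	io_req_complete_post(req, 0);	/* issue_flags lack IO_URING_F_IOWQ */
	/*
	 * The WARN_ON_ONCE() fires and the function returns early, rather
	 * than silently dropping a reference the caller never owned.
	 */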
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1013b60c35d431d0698cafbc53c06f5917348c20.1712331455.git.asml.silence@gmail.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8bd5db2056ee..8078d6944411 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -929,6 +929,13 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + /* + * All execution paths but io-wq use the deferred completions by + * passing IO_URING_F_COMPLETE_DEFER and thus should not end up here. + */ + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_IOWQ))) + return; + /* * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. @@ -946,7 +953,10 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } io_cq_unlock_post(ctx); - /* called from io-wq submit work only, the ref won't drop to zero */ + /* + * We don't free the request here because we know it's called from + * io-wq only, which holds a reference, so it cannot be the last put. + */ req_ref_put(req); } -- cgit v1.2.3 From d9713ad3fa227726a0b4d544c5a4cdd393c1933e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 5 Apr 2024 16:50:04 +0100 Subject: io_uring: remove async request cache io_req_complete_post() was the sole user of ->locked_free_list, but since we just gutted the function, the cache is not used anymore and can be removed. ->locked_free_list served as an asynchronous counterpart of the main request (i.e. struct io_kiocb) cache for all unlocked cases like io-wq. Now they're all forced to be completed into the main cache directly, off of the normal completion path or via io_free_req(). 
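Concretely, the removed path versus the one that remains, as a short sketch taken in spirit from the surrounding diffs rather than verbatim:

	/* old, now-removed io-wq path (ran under ->completion_lock): */
	wq_list_add_head(&req->comp_list, &ctx->locked_free_list);
	ctx->locked_free_nr++;

	/* the only remaining recycling path (runs under ->uring_lock): */
	io_req_add_to_cache(req, ctx);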
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7bffccd213e370abd4de480e739d8b08ab6c1326.1712331455.git.asml.silence@gmail.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 4 ---- io_uring/io_uring.c | 22 ---------------------- 2 files changed, 26 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index d34c8433caf9..c47f412cf18e 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -346,10 +346,6 @@ struct io_ring_ctx { spinlock_t completion_lock; - /* IRQ completion list, under ->completion_lock */ - unsigned int locked_free_nr; - struct io_wq_work_list locked_free_list; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct io_hash_table cancel_table; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8078d6944411..e291f227a1a0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -334,7 +334,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; - INIT_WQ_LIST(&ctx->locked_free_list); INIT_HLIST_HEAD(&ctx->waitid_list); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); @@ -988,15 +987,6 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } -static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_submit_state *state) -{ - spin_lock(&ctx->completion_lock); - wq_list_splice(&ctx->locked_free_list, &state->free_list); - ctx->locked_free_nr = 0; - spin_unlock(&ctx->completion_lock); -} - /* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. @@ -1010,17 +1000,6 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) void *reqs[IO_REQ_ALLOC_BATCH]; int ret; - /* - * If we have more than a batch's worth of requests in our IRQ side - * locked cache, grab the lock and move them over to our submission - * side cache. - */ - if (data_race(ctx->locked_free_nr) > IO_COMPL_BATCH) { - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); - if (!io_req_cache_empty(ctx)) - return true; - } - ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* @@ -2654,7 +2633,6 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) int nr = 0; mutex_lock(&ctx->uring_lock); - io_flush_cached_locked_reqs(ctx, &ctx->submit_state); while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); -- cgit v1.2.3 From c29006a2456bc9c2aea09e4a8958b4d9a7dfcb5a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 5 Apr 2024 16:50:05 +0100 Subject: io_uring: remove io_req_put_rsrc_locked() io_req_put_rsrc_locked() is a weird shim function around io_req_put_rsrc(). All calls to io_req_put_rsrc() require holding ->uring_lock, so we can just use it directly. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a195bc78ac3d2c6fbaea72976e982fe51e50ecdd.1712331455.git.asml.silence@gmail.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ++--- io_uring/rsrc.h | 6 ------ 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e291f227a1a0..9d389bd89006 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1449,10 +1449,9 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); - - io_req_put_rsrc_locked(req, ctx); - + io_put_rsrc_node(ctx, req->rsrc_node); io_put_task(req->task); + node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 83c079a707f8..c032ca3436ca 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -83,12 +83,6 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node io_rsrc_node_ref_zero(node); } -static inline void io_req_put_rsrc_locked(struct io_kiocb *req, - struct io_ring_ctx *ctx) -{ - io_put_rsrc_node(ctx, req->rsrc_node); -} - static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { -- cgit v1.2.3 From bbbef3e9d2a82754bd72a86ba1352cfc17bf31a7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 7 Apr 2024 21:27:59 +0800 Subject: io_uring: return void from io_put_kbuf_comp() The only caller doesn't handle the return value of io_put_kbuf_comp(), so change its return type into void. Also follow Jens's suggestion to rename it as io_put_kbuf_drop(). Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240407132759.4056167-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/kbuf.h | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9d389bd89006..b7984442dd84 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -381,7 +381,7 @@ static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); + io_kbuf_drop(req); spin_unlock(&req->ctx->completion_lock); } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 53c141d9a8b2..5a9635ee0217 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -120,18 +120,14 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, } } -static inline unsigned int io_put_kbuf_comp(struct io_kiocb *req) +static inline void io_kbuf_drop(struct io_kiocb *req) { - unsigned int ret; - lockdep_assert_held(&req->ctx->completion_lock); if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return 0; + return; - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); - return ret; } static inline unsigned int io_put_kbuf(struct io_kiocb *req, -- cgit v1.2.3 From 998632921d28a6d360d2a6d7ffcfbcf4f6f17fc2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 00:54:55 +0100 Subject: io_uring/net: merge ubuf sendzc callbacks Splitting io_tx_ubuf_callback_ext from io_tx_ubuf_callback is a pre mature optimisation that doesn't give us much. Merge the functions into one and reclaim some simplicity back. 
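For context, this single callback is what the networking stack invokes once it is done with the skb data; roughly as below (sketch, the helper name is made up):

static void zc_complete_sketch(struct sk_buff *skb, bool success)
{
	struct ubuf_info *uarg = skb_zcopy(skb);

	if (uarg)
		uarg->callback(skb, uarg, success);	/* -> io_tx_ubuf_callback() */
}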
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d44d68f6f7add33a0dcf0b7fd7b73c2dc543604f.1712534031.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index d3e703c37aba..47ff2da8c421 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -30,36 +30,24 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); - if (refcount_dec_and_test(&uarg->refcnt)) - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); -} - -static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) -{ - struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); - if (nd->zc_report) { if (success && !nd->zc_used && skb) WRITE_ONCE(nd->zc_used, true); else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } - io_tx_ubuf_callback(skb, uarg, success); + + if (refcount_dec_and_test(&uarg->refcnt)) + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } void io_notif_set_extended(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { - nd->account_pages = 0; - nd->zc_report = false; - nd->zc_used = false; - nd->zc_copied = false; - nd->uarg.callback = io_tx_ubuf_callback_ext; - notif->io_task_work.func = io_notif_complete_tw_ext; - } + nd->zc_used = false; + nd->zc_copied = false; + notif->io_task_work.func = io_notif_complete_tw_ext; } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -79,6 +67,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); + nd->zc_report = false; + nd->account_pages = 0; nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.callback = io_tx_ubuf_callback; refcount_set(&nd->uarg.refcnt, 1); -- cgit v1.2.3 From 6b7f864bb70591b1ba8f538c13de2a8396bfec8a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 00:54:56 +0100 Subject: io_uring/net: get rid of io_notif_complete_tw_ext io_notif_complete_tw_ext() can be removed and combined with io_notif_complete_tw to make it simpler without sacrificing anything. 
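The flush side keeps the same late binding; a send-zc completion path uses it roughly as follows (simplified sketch; the real caller has extra handling for the io-wq case):

	io_notif_flush(zc->notif);	/* drop the master ref; if it was the
					 * last one, io_notif_tw_complete()
					 * gets queued via task_work */
	zc->notif = NULL;
	io_req_set_res(req, ret, IORING_CQE_F_MORE);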
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/025a124a5e20e2474a57e2f04f16c422eb83063c.1712534031.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 10 +++++----- io_uring/notif.c | 18 +++++------------- io_uring/notif.h | 6 ++++-- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index d0abc5689066..688fee3a0c53 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1018,8 +1018,11 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (zc->flags & ~IO_ZC_FLAGS_VALID) return -EINVAL; if (zc->flags & IORING_SEND_ZC_REPORT_USAGE) { - io_notif_set_extended(notif); - io_notif_to_data(notif)->zc_report = true; + struct io_notif_data *nd = io_notif_to_data(notif); + + nd->zc_report = true; + nd->zc_used = false; + nd->zc_copied = false; } } @@ -1128,7 +1131,6 @@ static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) return ret; kmsg->msg.sg_from_iter = io_sg_from_iter; } else { - io_notif_set_extended(sr->notif); ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); if (unlikely(ret)) return ret; @@ -1217,8 +1219,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; - io_notif_set_extended(sr->notif); - sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; diff --git a/io_uring/notif.c b/io_uring/notif.c index 47ff2da8c421..b561bd763435 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,12 +9,12 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) +void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; - if (nd->zc_report && (nd->zc_copied || !nd->zc_used)) + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; if (nd->account_pages && ctx->user) { @@ -37,17 +37,10 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, WRITE_ONCE(nd->zc_copied, true); } - if (refcount_dec_and_test(&uarg->refcnt)) + if (refcount_dec_and_test(&uarg->refcnt)) { + notif->io_task_work.func = io_notif_tw_complete; __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); -} - -void io_notif_set_extended(struct io_kiocb *notif) -{ - struct io_notif_data *nd = io_notif_to_data(notif); - - nd->zc_used = false; - nd->zc_copied = false; - notif->io_task_work.func = io_notif_complete_tw_ext; + } } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -64,7 +57,6 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) notif->task = current; io_get_task_refs(1); notif->rsrc_node = NULL; - notif->io_task_work.func = io_req_task_complete; nd = io_notif_to_data(notif); nd->zc_report = false; diff --git a/io_uring/notif.h b/io_uring/notif.h index 86d32bd9f856..52e124a9957c 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -20,7 +20,7 @@ struct io_notif_data { }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); -void io_notif_set_extended(struct io_kiocb *notif); +void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { @@ -33,8 +33,10 @@ static inline void io_notif_flush(struct io_kiocb *notif) struct io_notif_data *nd = io_notif_to_data(notif); /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) + 
if (refcount_dec_and_test(&nd->uarg.refcnt)) { + notif->io_task_work.func = io_notif_tw_complete; __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + } } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) -- cgit v1.2.3 From d285da7dbd3b3cc9b4cf822039a87ca4e4106ecf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 8 Apr 2024 00:54:57 +0100 Subject: io_uring/net: set MSG_ZEROCOPY for sendzc in advance We can set MSG_ZEROCOPY at the preparation step, do it so we don't have to care about it later in the issue callback. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c2c22aaa577624977f045979a6db2b9fb2e5648c.1712534031.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index 688fee3a0c53..a1da8a2ebf15 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1050,7 +1050,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; @@ -1167,7 +1167,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return ret; } - msg_flags = zc->msg_flags | MSG_ZEROCOPY; + msg_flags = zc->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) msg_flags |= MSG_DONTWAIT; if (msg_flags & MSG_WAITALL) @@ -1229,7 +1229,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - flags = sr->msg_flags | MSG_ZEROCOPY; + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; if (flags & MSG_WAITALL) -- cgit v1.2.3 From 8c9a6f549e65912825e31dc1e0e3f7995984649d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 9 Apr 2024 14:05:53 -0700 Subject: io_uring: separate header for exported net bits We're exporting some io_uring bits to networking, e.g. for implementing a net callback for io_uring cmds, but we don't want to expose more than needed. Add a separate header for networking. 
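With that, a consumer outside io_uring only needs the slim header; for example (sketch of a hypothetical call site):

#include <linux/io_uring/net.h>

static int uring_cmd_sock_example(struct io_uring_cmd *cmd,
				  unsigned int issue_flags)
{
	/*
	 * Resolves to the real implementation with CONFIG_IO_URING=y and
	 * to the static -EOPNOTSUPP stub otherwise.
	 */
	return io_uring_cmd_sock(cmd, issue_flags);
}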
Signed-off-by: Pavel Begunkov Signed-off-by: David Wei Link: https://lore.kernel.org/r/20240409210554.1878789-1-dw@davidwei.uk Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 6 ------ include/linux/io_uring/net.h | 18 ++++++++++++++++++ io_uring/uring_cmd.c | 1 + net/socket.c | 2 +- 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 include/linux/io_uring/net.h diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 68ed6697fece..e123d5e17b52 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -11,7 +11,6 @@ void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) @@ -45,11 +44,6 @@ static inline const char *io_uring_get_opcode(u8 opcode) { return ""; } -static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} static inline bool io_is_uring_fops(struct file *file) { return false; diff --git a/include/linux/io_uring/net.h b/include/linux/io_uring/net.h new file mode 100644 index 000000000000..b58f39fed4d5 --- /dev/null +++ b/include/linux/io_uring/net.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_NET_H +#define _LINUX_IO_URING_NET_H + +struct io_uring_cmd; + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); + +#else +static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + return -EOPNOTSUPP; +} +#endif + +#endif diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 334d31dd6628..21ac5fb2d5f0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/net/socket.c b/net/socket.c index e5f3af49a8b6..01a71ae10c35 100644 --- a/net/socket.c +++ b/net/socket.c @@ -88,7 +88,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.3 From a5bff51850c8d533f3696d45749ab169dd49f8dd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Apr 2024 02:26:51 +0100 Subject: io_uring: unexport io_req_cqe_overflow() There are no users of io_req_cqe_overflow() apart from io_uring.c, make it static. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/f4295eb2f9eb98d5db38c0578f57f0b86bfe0d8c.1712708261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b7984442dd84..454d1d7695f6 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -819,7 +819,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, return true; } -void io_req_cqe_overflow(struct io_kiocb *req) +static void io_req_cqe_overflow(struct io_kiocb *req) { io_cqring_event_overflow(req->ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1eb65324792a..624ca9076a50 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -62,7 +62,6 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) } bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); -void io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -- cgit v1.2.3 From e45ec969d17acb0a1bea78c9c6c4403fceccd599 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Apr 2024 02:26:52 +0100 Subject: io_uring: remove extra SQPOLL overflow flush c1edbf5f081be ("io_uring: flag SQPOLL busy condition to userspace") added an extra overflowed CQE flush in the SQPOLL submission path due to backpressure, which was later removed. Remove the flush and let io_cqring_wait() / iopoll handle it. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2a83b0724ca6ca9d16c7d79a51b77c81876b2e39.1712708261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 454d1d7695f6..885f701983d7 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3238,8 +3238,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx); - if (unlikely(ctx->sq_data->thread == NULL)) { ret = -EOWNERDEAD; goto out; -- cgit v1.2.3 From 408024b959275ba52c233c647901cacfc8d5a226 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Apr 2024 02:26:53 +0100 Subject: io_uring: open code io_cqring_overflow_flush() There is only one caller of io_cqring_overflow_flush(), open code it Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a1fecd56d9dba923ed8d4d159727fa939d3baa2a.1712708261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 885f701983d7..740bfec10fde 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -726,12 +726,6 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) -{ - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) - io_cqring_do_overflow_flush(ctx); -} - /* can be called by any task */ static void io_put_task_remote(struct task_struct *task) { @@ -2452,8 +2446,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, if (!llist_empty(&ctx->work_llist)) io_run_local_work(ctx, min_events); io_run_task_work(); - 
io_cqring_overflow_flush(ctx); - /* if user messes with these they will just get an early return */ + + if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) + io_cqring_do_overflow_flush(ctx); if (__io_cqring_events_user(ctx) >= min_events) return 0; -- cgit v1.2.3 From 8d09a88ef9d3cb7d21d45c39b7b7c31298d23998 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Apr 2024 02:26:54 +0100 Subject: io_uring: always lock __io_cqring_overflow_flush Conditional locking is never great, in case of __io_cqring_overflow_flush(), which is a slow path, it's not justified. Don't handle IOPOLL separately, always grab uring_lock for overflow flushing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/162947df299aa12693ac4b305dacedab32ec7976.1712708261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 740bfec10fde..92ac9c0fc597 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -673,6 +673,8 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) struct io_overflow_cqe *ocqe; LIST_HEAD(list); + lockdep_assert_held(&ctx->uring_lock); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); @@ -689,6 +691,8 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { size_t cqe_size = sizeof(struct io_uring_cqe); + lockdep_assert_held(&ctx->uring_lock); + if (__io_cqring_events(ctx) == ctx->cq_entries) return; @@ -718,12 +722,9 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { - /* iopoll syncs against uring_lock, not completion_lock */ - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_lock(&ctx->uring_lock); + mutex_lock(&ctx->uring_lock); __io_cqring_overflow_flush(ctx); - if (ctx->flags & IORING_SETUP_IOPOLL) - mutex_unlock(&ctx->uring_lock); + mutex_unlock(&ctx->uring_lock); } /* can be called by any task */ @@ -1522,6 +1523,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) unsigned int nr_events = 0; unsigned long check_cq; + lockdep_assert_held(&ctx->uring_lock); + if (!io_allowed_run_tw(ctx)) return -EEXIST; -- cgit v1.2.3 From 6b231248e97fc37d4205449d48747b5a3b4c2fcc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 10 Apr 2024 02:26:55 +0100 Subject: io_uring: consolidate overflow flushing Consolidate __io_cqring_overflow_flush and io_cqring_overflow_kill() into a single function as it once was, it's easier to work with it this way. 
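For orientation, the shape the overflow-flush paths take after this pair of changes can be condensed as below; this is a paraphrase of the diff that follows, not additional code, and the function bodies are elided.

static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{
	lockdep_assert_held(&ctx->uring_lock);
	/* move overflowed CQEs into the CQ ring; if dying, just free them */
}

static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{
	if (ctx->rings)
		__io_cqring_overflow_flush(ctx, true);	/* ring teardown path */
}

static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{
	mutex_lock(&ctx->uring_lock);		/* unconditional, no IOPOLL special case */
	__io_cqring_overflow_flush(ctx, false);
	mutex_unlock(&ctx->uring_lock);
}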
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/986b42c35e76a6be7aa0cdcda0a236a2222da3a7.1712708261.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 92ac9c0fc597..c4419eef7e63 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -668,26 +668,7 @@ static void io_cq_unlock_post(struct io_ring_ctx *ctx) io_commit_cqring_flush(ctx); } -static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) -{ - struct io_overflow_cqe *ocqe; - LIST_HEAD(list); - - lockdep_assert_held(&ctx->uring_lock); - - spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->cq_overflow_list, &list); - clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - spin_unlock(&ctx->completion_lock); - - while (!list_empty(&list)) { - ocqe = list_first_entry(&list, struct io_overflow_cqe, list); - list_del(&ocqe->list); - kfree(ocqe); - } -} - -static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) { size_t cqe_size = sizeof(struct io_uring_cqe); @@ -704,11 +685,14 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) struct io_uring_cqe *cqe; struct io_overflow_cqe *ocqe; - if (!io_get_cqe_overflow(ctx, &cqe, true)) - break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - memcpy(cqe, &ocqe->cqe, cqe_size); + + if (!dying) { + if (!io_get_cqe_overflow(ctx, &cqe, true)) + break; + memcpy(cqe, &ocqe->cqe, cqe_size); + } list_del(&ocqe->list); kfree(ocqe); } @@ -720,10 +704,16 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) io_cq_unlock_post(ctx); } +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) +{ + if (ctx->rings) + __io_cqring_overflow_flush(ctx, true); +} + static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); mutex_unlock(&ctx->uring_lock); } @@ -1531,7 +1521,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx, false); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. -- cgit v1.2.3 From 4d0f4a5413490391c6cd16407a0f71b51700a68a Mon Sep 17 00:00:00 2001 From: Ruyi Zhang Date: Thu, 11 Apr 2024 13:59:53 +0800 Subject: io_uring/timeout: remove duplicate initialization of the io_timeout list. In the __io_timeout_prep function, the io_timeout list is initialized twice, removing the meaningless second initialization. 
Signed-off-by: Ruyi Zhang Link: https://lore.kernel.org/r/20240411055953.2029218-1-ruyi.zhang@samsung.com Signed-off-by: Jens Axboe --- io_uring/timeout.c | 1 - 1 file changed, 1 deletion(-) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 3458ca550b83..1c9bf07499b1 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -537,7 +537,6 @@ static int __io_timeout_prep(struct io_kiocb *req, if (data->ts.tv_sec < 0 || data->ts.tv_nsec < 0) return -EINVAL; - INIT_LIST_HEAD(&timeout->list); data->mode = io_translate_timeout_mode(flags); hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); -- cgit v1.2.3 From 686b56cbeedc9f4c72f9bb781918194a9a3e8334 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Apr 2024 13:16:20 -0600 Subject: io_uring: ensure overflow entries are dropped when ring is exiting A previous consolidation cleanup missed handling the case where the ring is dying, and __io_cqring_overflow_flush() doesn't flush entries if the CQ ring is already full. This is fine for the normal CQE overflow flushing, but if the ring is going away, we need to flush everything, even if it means simply freeing the overflown entries. Fixes: 6c948ec44b29 ("io_uring: consolidate overflow flushing") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c4419eef7e63..3c9087f37c43 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -674,7 +674,8 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) lockdep_assert_held(&ctx->uring_lock); - if (__io_cqring_events(ctx) == ctx->cq_entries) + /* don't abort if we're dying, entries must get freed */ + if (!dying && __io_cqring_events(ctx) == ctx->cq_entries) return; if (ctx->flags & IORING_SETUP_CQE32) -- cgit v1.2.3 From 7e58d0af5a587e74f46f55b91a0197f750eba78c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Apr 2024 13:50:11 +0100 Subject: io_uring/notif: refactor io_tx_ubuf_complete() Flip the dec_and_test "if", that makes the function extension easier in the future. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/43939e2b04dff03bff5d7227c98afedf951227b3.1713185320.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index b561bd763435..452c255de04a 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -37,10 +37,11 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, WRITE_ONCE(nd->zc_copied, true); } - if (refcount_dec_and_test(&uarg->refcnt)) { - notif->io_task_work.func = io_notif_tw_complete; - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); - } + if (!refcount_dec_and_test(&uarg->refcnt)) + return; + + notif->io_task_work.func = io_notif_tw_complete; + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) -- cgit v1.2.3 From 2e730d8de45768810df4a6859cd64c5387cf0131 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Apr 2024 13:50:12 +0100 Subject: io_uring/notif: remove ctx var from io_notif_tw_complete We don't need ctx in the hottest path, i.e. registered buffers, let's get it only when we need it. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e7345e268ffaeaf79b4c8f3a5d019d6a87a3d1f1.1713185320.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index 452c255de04a..3485437b207d 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -12,13 +12,12 @@ void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); - struct io_ring_ctx *ctx = notif->ctx; if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; - if (nd->account_pages && ctx->user) { - __io_unaccount_mem(ctx->user, nd->account_pages); + if (nd->account_pages && notif->ctx->user) { + __io_unaccount_mem(notif->ctx->user, nd->account_pages); nd->account_pages = 0; } io_req_task_complete(notif, ts); -- cgit v1.2.3 From d6e295061f239bee48c9e49313f68042121e21c2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 Apr 2024 13:50:13 +0100 Subject: io_uring/notif: shrink account_pages to u32 ->account_pages is the number of pages we account against the user derived from unsigned len, it definitely fits into unsigned, which saves some space in struct io_notif_data. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19f2687fcb36daa74d86f4a27bfb3d35cffec318.1713185320.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/io_uring/notif.h b/io_uring/notif.h index 52e124a9957c..2e25a2fc77d1 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -13,7 +13,8 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; - unsigned long account_pages; + + unsigned account_pages; bool zc_report; bool zc_used; bool zc_copied; -- cgit v1.2.3 From c4ce0ab27646f4206a9eb502d6fe45cb080e1cae Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Mar 2024 07:38:38 -0600 Subject: io_uring/sqpoll: work around a potential audit memory leak kmemleak complains that there's a memory leak related to connect handling: unreferenced object 0xffff0001093bdf00 (size 128): comm "iou-sqp-455", pid 457, jiffies 4294894164 hex dump (first 32 bytes): 02 00 fa ea 7f 00 00 01 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace (crc 2e481b1a): [<00000000c0a26af4>] kmemleak_alloc+0x30/0x38 [<000000009c30bb45>] kmalloc_trace+0x228/0x358 [<000000009da9d39f>] __audit_sockaddr+0xd0/0x138 [<0000000089a93e34>] move_addr_to_kernel+0x1a0/0x1f8 [<000000000b4e80e6>] io_connect_prep+0x1ec/0x2d4 [<00000000abfbcd99>] io_submit_sqes+0x588/0x1e48 [<00000000e7c25e07>] io_sq_thread+0x8a4/0x10e4 [<00000000d999b491>] ret_from_fork+0x10/0x20 which can can happen if: 1) The command type does something on the prep side that triggers an audit call. 2) The thread hasn't done any operations before this that triggered an audit call inside ->issue(), where we have audit_uring_entry() and audit_uring_exit(). Work around this by issuing a blanket NOP operation before the SQPOLL does anything. 
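For reference, the window described above can be hit from userspace with nothing more than an SQPOLL ring whose first submitted request is a connect, so that io_connect_prep() (and with it move_addr_to_kernel() / __audit_sockaddr()) runs on the iou-sqp thread before any issue-side audit_uring_entry()/exit() pair. The sketch below is illustrative only, not part of the patch; it is liburing-style, the loopback address and port are placeholders, and error handling is trimmed.

#include <liburing.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct io_uring_params p = { .flags = IORING_SETUP_SQPOLL };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_port = htons(8080),			/* placeholder port */
		.sin_addr.s_addr = htonl(INADDR_LOOPBACK),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (io_uring_queue_init_params(8, &ring, &p))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_connect(sqe, fd, (struct sockaddr *)&addr, sizeof(addr));
	io_uring_submit(&ring);		/* prepped and issued by the iou-sqp-* thread */

	/* result doesn't matter (likely -ECONNREFUSED); the prep path is the point */
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	close(fd);
	return 0;
}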
Signed-off-by: Jens Axboe --- io_uring/sqpoll.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 3983708cef5b..554c7212aa46 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -291,6 +291,14 @@ static int io_sq_thread(void *data) sqd->sq_cpu = raw_smp_processor_id(); } + /* + * Force audit context to get setup, in case we do prep side async + * operations that would trigger an audit call before any issue side + * audit has been done. + */ + audit_uring_entry(IORING_OP_NOP); + audit_uring_exit(true, 0); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; -- cgit v1.2.3 From 068c27e32e51e94e4a9eb30ae85f4097a3602980 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 15 Apr 2024 22:10:53 -0400 Subject: io-wq: write next_work before dropping acct_lock Commit 361aee450c6e ("io-wq: add intermediate work step between pending list and active work") closed a race between a cancellation and the work being removed from the wq for execution. To ensure the request is always reachable by the cancellation, we need to move it within the wq lock, which also synchronizes the cancellation. But commit 42abc95f05bf ("io-wq: decouple work_list protection from the big wqe->lock") replaced the wq lock here and accidentally reintroduced the race by releasing the acct_lock too early. In other words: worker | cancellation work = io_get_next_work() | raw_spin_unlock(&acct->lock); | | | io_acct_cancel_pending_work | io_wq_worker_cancel() worker->next_work = work Using acct_lock is still enough since we synchronize on it on io_acct_cancel_pending_work. Fixes: 42abc95f05bf ("io-wq: decouple work_list protection from the big wqe->lock") Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240416021054.3940-2-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 522196dfb0ff..318ed067dbf6 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -564,10 +564,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * clear the stalled flag. */ work = io_get_next_work(acct, worker); - raw_spin_unlock(&acct->lock); if (work) { - __io_worker_busy(wq, worker); - /* * Make sure cancelation can find this, even before * it becomes the active work. That avoids a window @@ -578,9 +575,15 @@ static void io_worker_handle_work(struct io_wq_acct *acct, raw_spin_lock(&worker->lock); worker->next_work = work; raw_spin_unlock(&worker->lock); - } else { - break; } + + raw_spin_unlock(&acct->lock); + + if (!work) + break; + + __io_worker_busy(wq, worker); + io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 24c3fc5c75c5b9d471783b4a4958748243828613 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 15 Apr 2024 22:10:54 -0400 Subject: io-wq: Drop intermediate step between pending list and active work next_work is only used to make the work visible for cancellation. Instead, we can just directly write to cur_work before dropping the acct_lock and avoid the extra hop. 
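Condensing this change together with the preceding io-wq fix, the ordering on the worker side ends up as sketched below (a paraphrase of the two diffs, with the rest of the loop elided): the work item is published for cancellation while acct->lock is still held, and it now goes straight into ->cur_work.

raw_spin_lock(&acct->lock);
work = io_get_next_work(acct, worker);
if (work) {
	raw_spin_lock(&worker->lock);
	worker->cur_work = work;	/* visible to io_wq_worker_cancel() */
	raw_spin_unlock(&worker->lock);
}
raw_spin_unlock(&acct->lock);		/* a canceller can only win acct->lock from here on */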
Signed-off-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20240416021054.3940-3-krisman@suse.de Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 318ed067dbf6..d7fc6f6d4477 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -51,7 +51,6 @@ struct io_worker { struct io_wq *wq; struct io_wq_work *cur_work; - struct io_wq_work *next_work; raw_spinlock_t lock; struct completion ref_done; @@ -539,7 +538,6 @@ static void io_assign_current_work(struct io_worker *worker, raw_spin_lock(&worker->lock); worker->cur_work = work; - worker->next_work = NULL; raw_spin_unlock(&worker->lock); } @@ -573,7 +571,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * current work item for this worker. */ raw_spin_lock(&worker->lock); - worker->next_work = work; + worker->cur_work = work; raw_spin_unlock(&worker->lock); } @@ -1008,8 +1006,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) * may dereference the passed in work. */ raw_spin_lock(&worker->lock); - if (__io_wq_worker_cancel(worker, match, worker->cur_work) || - __io_wq_worker_cancel(worker, match, worker->next_work)) + if (__io_wq_worker_cancel(worker, match, worker->cur_work)) match->nr_running++; raw_spin_unlock(&worker->lock); -- cgit v1.2.3 From df604d2ad480fcf7b39767280c9093e13b1de952 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Apr 2024 09:23:55 -0600 Subject: io_uring/rw: ensure retry condition isn't lost A previous commit removed the checking on whether or not it was possible to retry a request, since it's now possible to retry any of them. This would previously have caused the request to have been ended with an error, but now the retry condition can simply get lost instead. Cleanup the retry handling and always just punt it to task_work, which will queue it with io-wq appropriately. 
Reported-by: Changhui Zhong Tested-by: Ming Lei Fixes: cca6571381a0 ("io_uring/rw: cleanup retry path") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++++++++ io_uring/io_uring.h | 1 + io_uring/rw.c | 13 ++++++------- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c9087f37c43..c67ae6e36c4f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,6 +527,19 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } +static void io_tw_requeue_iowq(struct io_kiocb *req, struct io_tw_state *ts) +{ + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); +} + +void io_tw_queue_iowq(struct io_kiocb *req) +{ + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + req->io_task_work.func = io_tw_requeue_iowq; + io_req_task_work_add(req); +} + static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 624ca9076a50..b83a719c5443 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,7 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); +void io_tw_queue_iowq(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index 3134a6ece1be..4fed829fe97c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -455,7 +455,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) * current cycle. */ io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + io_tw_queue_iowq(req); return true; } req_set_fail(req); @@ -521,7 +521,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + io_tw_queue_iowq(req); return; } req->cqe.res = res; @@ -839,7 +839,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (req->flags & REQ_F_REISSUE) + return IOU_ISSUE_SKIP_COMPLETE; /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1034,10 +1035,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } + if (req->flags & REQ_F_REISSUE) + return IOU_ISSUE_SKIP_COMPLETE; /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit v1.2.3 From 3e747dedd47b6250390abfc08dc0aa4817d3c052 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 25 Feb 2024 12:52:39 -0700 Subject: io_uring/net: add generic multishot retry helper This is just moving io_recv_prep_retry() higher up so it can get used for sends as well, and rename it to be generically useful for both sends and receives. 
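From userspace, the contract this retry machinery serves is unchanged: a multishot request keeps posting CQEs flagged IORING_CQE_F_MORE until it terminates, at which point it has to be re-armed. A minimal liburing-style sketch of that consumer side, assuming the ring, socket and provided-buffer group are already set up:

#include <liburing.h>

static void arm_multishot_recv(struct io_uring *ring, int sockfd, int bgid)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;	/* pick buffers from group bgid */
	sqe->buf_group = bgid;
	io_uring_submit(ring);
}

static void reap(struct io_uring *ring, int sockfd, int bgid)
{
	struct io_uring_cqe *cqe;

	while (!io_uring_wait_cqe(ring, &cqe)) {
		if (cqe->res > 0 && (cqe->flags & IORING_CQE_F_BUFFER)) {
			unsigned bid = cqe->flags >> IORING_CQE_BUFFER_SHIFT;

			/* consume cqe->res bytes from buffer 'bid', then return
			 * the buffer to the provided buffer ring */
			(void)bid;
		}
		/* no IORING_CQE_F_MORE: the multishot ended, re-arm it */
		if (!(cqe->flags & IORING_CQE_F_MORE))
			arm_multishot_recv(ring, sockfd, bgid);
		io_uring_cqe_seen(ring, cqe);
	}
}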
Signed-off-by: Jens Axboe --- io_uring/net.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index a1da8a2ebf15..dc310f0bfe4c 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -185,6 +185,17 @@ static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, return 0; } +static inline void io_mshot_prep_retry(struct io_kiocb *req, + struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + + req->flags &= ~REQ_F_BL_EMPTY; + sr->done_io = 0; + sr->len = 0; /* get from the provided buffer */ + req->buf_index = sr->buf_group; +} + #ifdef CONFIG_COMPAT static int io_compat_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -658,17 +669,6 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_recvmsg_prep_setup(req); } -static inline void io_recv_prep_retry(struct io_kiocb *req, - struct io_async_msghdr *kmsg) -{ - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - - req->flags &= ~REQ_F_BL_EMPTY; - sr->done_io = 0; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; -} - /* * Finishes io_recv and io_recvmsg. * @@ -694,7 +694,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - io_recv_prep_retry(req, kmsg); + io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) -- cgit v1.2.3 From ac5f71a3d9d7eb540f6bf7e794eb4a3e4c3f11dd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 19 Feb 2024 10:46:44 -0700 Subject: io_uring/net: add provided buffer support for IORING_OP_SEND It's pretty trivial to wire up provided buffer support for the send side, just like how it's done the receive side. This enables setting up a buffer ring that an application can use to push pending sends to, and then have a send pick a buffer from that ring. One of the challenges with async IO and networking sends is that you can get into reordering conditions if you have more than one inflight at the same time. Consider the following scenario where everything is fine: 1) App queues sendA for socket1 2) App queues sendB for socket1 3) App does io_uring_submit() 4) sendA is issued, completes successfully, posts CQE 5) sendB is issued, completes successfully, posts CQE All is fine. Requests are always issued in-order, and both complete inline as most sends do. However, if we're flooding socket1 with sends, the following could also result from the same sequence: 1) App queues sendA for socket1 2) App queues sendB for socket1 3) App does io_uring_submit() 4) sendA is issued, socket1 is full, poll is armed for retry 5) Space frees up in socket1, this triggers sendA retry via task_work 6) sendB is issued, completes successfully, posts CQE 7) sendA is retried, completes successfully, posts CQE Now we've sent sendB before sendA, which can make things unhappy. 
If both sendA and sendB had been using provided buffers, then it would look as follows instead: 1) App queues dataA for sendA, queues sendA for socket1 2) App queues dataB for sendB queues sendB for socket1 3) App does io_uring_submit() 4) sendA is issued, socket1 is full, poll is armed for retry 5) Space frees up in socket1, this triggers sendA retry via task_work 6) sendB is issued, picks first buffer (dataA), completes successfully, posts CQE (which says "I sent dataA") 7) sendA is retried, picks first buffer (dataB), completes successfully, posts CQE (which says "I sent dataB") Now we've sent the data in order, and everybody is happy. It's worth noting that this also opens the door for supporting multishot sends, as provided buffers would be a prerequisite for that. Those can trigger either when new buffers are added to the outgoing ring, or (if stalled due to lack of space) when space frees up in the socket. Signed-off-by: Jens Axboe --- io_uring/net.c | 25 ++++++++++++++++++++----- io_uring/opdef.c | 1 + 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/io_uring/net.c b/io_uring/net.c index dc310f0bfe4c..13685d133582 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -364,10 +364,12 @@ static int io_send_setup(struct io_kiocb *req) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = sr->addr_len; } - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; - + if (!io_do_buffer_select(req)) { + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret < 0)) + return ret; + } return 0; } @@ -480,6 +482,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; + unsigned int cflags; unsigned flags; int min_ret = 0; int ret; @@ -492,6 +495,17 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; + if (io_do_buffer_select(req)) { + size_t len = sr->len; + void __user *buf; + + buf = io_buffer_select(req, &len, issue_flags); + if (unlikely(!buf)) + return -ENOBUFS; + sr->buf = buf; + sr->len = len; + } + flags = sr->msg_flags; if (issue_flags & IO_URING_F_NONBLOCK) flags |= MSG_DONTWAIT; @@ -521,7 +535,8 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_msg_cleanup(req, issue_flags); - io_req_set_res(req, ret, 0); + cflags = io_put_kbuf(req, issue_flags); + io_req_set_res(req, ret, cflags); return IOU_OK; } diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a16f73938ebb..2de5cca9504e 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -281,6 +281,7 @@ const struct io_issue_def io_issue_defs[] = { .pollout = 1, .audit_skip = 1, .ioprio = 1, + .buffer_select = 1, #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep = io_sendmsg_prep, -- cgit v1.2.3 From 35c8711c8fc4c16ad2749b8314da5829a493e28e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Mar 2024 07:31:52 -0700 Subject: io_uring/kbuf: add helpers for getting/peeking multiple buffers Our provided buffer interface only allows selection of a single buffer. Add an API that allows getting/peeking multiple buffers at the same time. This is only implemented for the ring provided buffers. 
It could be added for the legacy provided buffers as well, but since it's strongly encouraged to use the new interface, let's keep it simpler and just provide it for the new API. The legacy interface will always just select a single buffer. There are two new main functions: io_buffers_select(), which selects up as many buffers as it can. The caller supplies the iovec array, and io_buffers_select() may allocate a bigger array if the 'out_len' being passed in is non-zero and bigger than what fits in the provided iovec. Buffers grabbed with this helper are permanently assigned. io_buffers_peek(), which works like io_buffers_select(), except they can be recycled, if needed. Callers using either of these functions should call io_put_kbufs() rather than io_put_kbuf() at completion time. The peek interface must be called with the ctx locked from peek to completion. This add a bit state for the request: - REQ_F_BUFFERS_COMMIT, which means that the the buffers have been peeked and should be committed to the buffer ring head when they are put as part of completion. Prior to this, req->buf_list was cleared to NULL when committed. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 + io_uring/kbuf.c | 157 ++++++++++++++++++++++++++++++++++++++++- io_uring/kbuf.h | 53 +++++++++++--- 3 files changed, 201 insertions(+), 12 deletions(-) diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index c47f412cf18e..7a6b190c7da7 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -472,6 +472,7 @@ enum { REQ_F_CAN_POLL_BIT, REQ_F_BL_EMPTY_BIT, REQ_F_BL_NO_RECYCLE_BIT, + REQ_F_BUFFERS_COMMIT_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -550,6 +551,8 @@ enum { REQ_F_BL_EMPTY = IO_REQ_FLAG(REQ_F_BL_EMPTY_BIT), /* don't recycle provided buffers for this request */ REQ_F_BL_NO_RECYCLE = IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT), + /* buffer ring head needs incrementing on put */ + REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 3846a055df44..d2945c9c812b 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -117,6 +117,27 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +static int io_provided_buffers_select(struct io_kiocb *req, size_t *len, + struct io_buffer_list *bl, + struct iovec *iov) +{ + void __user *buf; + + buf = io_provided_buffer_select(req, len, bl); + if (unlikely(!buf)) + return -ENOBUFS; + + iov[0].iov_base = buf; + iov[0].iov_len = *len; + return 0; +} + +static struct io_uring_buf *io_ring_head_to_buf(struct io_uring_buf_ring *br, + __u16 head, __u16 mask) +{ + return &br->bufs[head & mask]; +} + static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -132,11 +153,10 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, if (head + 1 == tail) req->flags |= REQ_F_BL_EMPTY; - head &= bl->mask; - buf = &br->bufs[head]; + buf = io_ring_head_to_buf(br, head, bl->mask); if (*len == 0 || *len > buf->len) *len = buf->len; - req->flags |= REQ_F_BUFFER_RING; + req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; @@ -151,6 +171,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, * the transfer completes (or if we get -EAGAIN 
and must poll of * retry). */ + req->flags &= ~REQ_F_BUFFERS_COMMIT; req->buf_list = NULL; bl->head++; } @@ -177,6 +198,136 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, return ret; } +/* cap it at a reasonable 256, will be one page even for 4K */ +#define PEEK_MAX_IMPORT 256 + +static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, + struct io_buffer_list *bl) +{ + struct io_uring_buf_ring *br = bl->buf_ring; + struct iovec *iov = arg->iovs; + int nr_iovs = arg->nr_iovs; + __u16 nr_avail, tail, head; + struct io_uring_buf *buf; + + tail = smp_load_acquire(&br->tail); + head = bl->head; + nr_avail = min_t(__u16, tail - head, UIO_MAXIOV); + if (unlikely(!nr_avail)) + return -ENOBUFS; + + buf = io_ring_head_to_buf(br, head, bl->mask); + if (arg->max_len) { + int needed; + + needed = (arg->max_len + buf->len - 1) / buf->len; + needed = min(needed, PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; + } + + /* + * only alloc a bigger array if we know we have data to map, eg not + * a speculative peek operation. + */ + if (arg->mode & KBUF_MODE_EXPAND && nr_avail > nr_iovs && arg->max_len) { + iov = kmalloc_array(nr_avail, sizeof(struct iovec), GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + if (arg->mode & KBUF_MODE_FREE) + kfree(arg->iovs); + arg->iovs = iov; + nr_iovs = nr_avail; + } else if (nr_avail < nr_iovs) { + nr_iovs = nr_avail; + } + + /* set it to max, if not set, so we can use it unconditionally */ + if (!arg->max_len) + arg->max_len = INT_MAX; + + req->buf_index = buf->bid; + do { + /* truncate end piece, if needed */ + if (buf->len > arg->max_len) + buf->len = arg->max_len; + + iov->iov_base = u64_to_user_ptr(buf->addr); + iov->iov_len = buf->len; + iov++; + + arg->out_len += buf->len; + arg->max_len -= buf->len; + if (!arg->max_len) + break; + + buf = io_ring_head_to_buf(br, ++head, bl->mask); + } while (--nr_iovs); + + if (head == tail) + req->flags |= REQ_F_BL_EMPTY; + + req->flags |= REQ_F_BUFFER_RING; + req->buf_list = bl; + return iov - arg->iovs; +} + +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret = -ENOENT; + + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + goto out_unlock; + + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + /* + * Don't recycle these buffers if we need to go through poll. + * Nobody else can use them anyway, and holding on to provided + * buffers for a send/write operation would happen on the app + * side anyway with normal buffers. Besides, we already + * committed them, they cannot be put back in the queue. 
+ */ + if (ret > 0) { + req->flags |= REQ_F_BL_NO_RECYCLE; + req->buf_list->head += ret; + } + } else { + ret = io_provided_buffers_select(req, &arg->out_len, bl, arg->iovs); + } +out_unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; + + lockdep_assert_held(&ctx->uring_lock); + + bl = io_buffer_get_list(ctx, req->buf_index); + if (unlikely(!bl)) + return -ENOENT; + + if (bl->is_buf_ring) { + ret = io_ring_buffers_peek(req, arg, bl); + if (ret > 0) + req->flags |= REQ_F_BUFFERS_COMMIT; + return ret; + } + + /* don't support multiple buffer selections for legacy */ + return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); +} + static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer_list *bl, unsigned nbufs) { diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 5a9635ee0217..b90aca3a57fa 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -41,8 +41,26 @@ struct io_buffer { __u16 bgid; }; +enum { + /* can alloc a bigger vec */ + KBUF_MODE_EXPAND = 1, + /* if bigger vec allocated, free old one */ + KBUF_MODE_FREE = 2, +}; + +struct buf_sel_arg { + struct iovec *iovs; + size_t out_len; + size_t max_len; + int nr_iovs; + int mode; +}; + void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags); +int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, + unsigned int issue_flags); +int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); @@ -75,7 +93,7 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) */ if (req->buf_list) { req->buf_index = req->buf_list->bgid; - req->flags &= ~REQ_F_BUFFER_RING; + req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } return false; @@ -99,11 +117,16 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -static inline void __io_put_kbuf_ring(struct io_kiocb *req) +static inline void __io_put_kbuf_ring(struct io_kiocb *req, int nr) { - if (req->buf_list) { - req->buf_index = req->buf_list->bgid; - req->buf_list->head++; + struct io_buffer_list *bl = req->buf_list; + + if (bl) { + if (req->flags & REQ_F_BUFFERS_COMMIT) { + bl->head += nr; + req->flags &= ~REQ_F_BUFFERS_COMMIT; + } + req->buf_index = bl->bgid; } req->flags &= ~REQ_F_BUFFER_RING; } @@ -112,7 +135,7 @@ static inline void __io_put_kbuf_list(struct io_kiocb *req, struct list_head *list) { if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, 1); } else { req->buf_index = req->kbuf->bgid; list_add(&req->kbuf->list, list); @@ -130,8 +153,8 @@ static inline void io_kbuf_drop(struct io_kiocb *req) __io_put_kbuf_list(req, &req->ctx->io_buffers_comp); } -static inline unsigned int io_put_kbuf(struct io_kiocb *req, - unsigned issue_flags) +static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) { unsigned int ret; @@ -140,9 +163,21 @@ static inline unsigned int io_put_kbuf(struct io_kiocb *req, ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); if (req->flags & REQ_F_BUFFER_RING) - __io_put_kbuf_ring(req); + __io_put_kbuf_ring(req, nbufs); else __io_put_kbuf(req, issue_flags); return ret; } + +static inline unsigned int io_put_kbuf(struct io_kiocb 
*req, + unsigned issue_flags) +{ + return __io_put_kbufs(req, 1, issue_flags); +} + +static inline unsigned int io_put_kbufs(struct io_kiocb *req, int nbufs, + unsigned issue_flags) +{ + return __io_put_kbufs(req, nbufs, issue_flags); +} #endif -- cgit v1.2.3 From a05d1f625c7aa681d8816bc0f10089289ad07aad Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Mar 2024 13:10:04 -0700 Subject: io_uring/net: support bundles for send If IORING_OP_SEND is used with provided buffers, the caller may also set IORING_RECVSEND_BUNDLE to turn it into a multi-buffer send. The idea is that an application can fill outgoing buffers in a provided buffer group, and then arm a single send that will service them all. Once there are no more buffers to send, or if the requested length has been sent, the request posts a single completion for all the buffers. This only enables it for IORING_OP_SEND, IORING_OP_SENDMSG is coming in a separate patch. However, this patch does do a lot of the prep work that makes wiring up the sendmsg variant pretty trivial. They share the prep side. Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 9 +++ io_uring/net.c | 145 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 137 insertions(+), 17 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index a7f847543a7f..7f583927c908 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -351,11 +351,20 @@ enum io_uring_op { * 0 is reported if zerocopy was actually possible. * IORING_NOTIF_USAGE_ZC_COPIED if data was copied * (at least partially). + * + * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send will + * grab as many buffers from the buffer group ID + * given and send them all. The completion result + * will be the number of buffers send, with the + * starting buffer ID in cqe->flags as per usual + * for provided buffer usage. The buffers will be + * contigious from the starting buffer ID. 
*/ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) #define IORING_RECVSEND_FIXED_BUF (1U << 2) #define IORING_SEND_ZC_REPORT_USAGE (1U << 3) +#define IORING_RECVSEND_BUNDLE (1U << 4) /* * cqe.res for IORING_CQE_F_NOTIF if diff --git a/io_uring/net.c b/io_uring/net.c index 13685d133582..3e326576254b 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -57,7 +57,7 @@ struct io_sr_msg { struct user_msghdr __user *umsg; void __user *buf; }; - unsigned len; + int len; unsigned done_io; unsigned msg_flags; unsigned nr_multishot_loops; @@ -389,6 +389,8 @@ static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) return ret; } +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) + int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -407,11 +409,20 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~IORING_RECVSEND_POLL_FIRST) + if (sr->flags & ~SENDMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_SENDMSG) + return -EINVAL; + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + sr->msg_flags |= MSG_WAITALL; + sr->buf_group = req->buf_index; + req->buf_list = NULL; + } #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -427,6 +438,79 @@ static void io_req_msg_cleanup(struct io_kiocb *req, io_netmsg_recycle(req, issue_flags); } +/* + * For bundle completions, we need to figure out how many segments we consumed. + * A bundle could be using a single ITER_UBUF if that's all we mapped, or it + * could be using an ITER_IOVEC. If the latter, then if we consumed all of + * the segments, then it's a trivial questiont o answer. If we have residual + * data in the iter, then loop the segments to figure out how much we + * transferred. + */ +static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) +{ + struct iovec *iov; + int nbufs; + + /* no data is always zero segments, and a ubuf is always 1 segment */ + if (ret <= 0) + return 0; + if (iter_is_ubuf(&kmsg->msg.msg_iter)) + return 1; + + iov = kmsg->free_iov; + if (!iov) + iov = &kmsg->fast_iov; + + /* if all data was transferred, it's basic pointer math */ + if (!iov_iter_count(&kmsg->msg.msg_iter)) + return iter_iov(&kmsg->msg.msg_iter) - iov; + + /* short transfer, count segments */ + nbufs = 0; + do { + int this_len = min_t(int, iov[nbufs].iov_len, ret); + + nbufs++; + ret -= this_len; + } while (ret); + + return nbufs; +} + +static inline bool io_send_finish(struct io_kiocb *req, int *ret, + struct io_async_msghdr *kmsg, + unsigned issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + bool bundle_finished = *ret <= 0; + unsigned int cflags; + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) { + cflags = io_put_kbuf(req, issue_flags); + goto finish; + } + + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), issue_flags); + + if (bundle_finished || req->flags & REQ_F_BL_EMPTY) + goto finish; + + /* + * Fill CQE for this receive and see if we should keep trying to + * receive from this socket. 
+ */ + if (io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { + io_mshot_prep_retry(req, kmsg); + return false; + } + + /* Otherwise stop bundle and use the current result. */ +finish: + io_req_set_res(req, *ret, cflags); + *ret = IOU_OK; + return true; +} + int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -482,7 +566,6 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; struct socket *sock; - unsigned int cflags; unsigned flags; int min_ret = 0; int ret; @@ -495,21 +578,47 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; + flags = sr->msg_flags; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + +retry_bundle: if (io_do_buffer_select(req)) { - size_t len = sr->len; - void __user *buf; + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = min_not_zero(sr->len, INT_MAX), + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } - buf = io_buffer_select(req, &len, issue_flags); - if (unlikely(!buf)) - return -ENOBUFS; - sr->buf = buf; - sr->len = len; + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + sr->len = arg.out_len; + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + } } - flags = sr->msg_flags; - if (issue_flags & IO_URING_F_NONBLOCK) - flags |= MSG_DONTWAIT; - if (flags & MSG_WAITALL) + /* + * If MSG_WAITALL is set, or this is a bundle send, then we need + * the full amount. If just bundle is set, if we do a short send + * then we complete the bundle sequence rather than continue on. + */ + if (flags & MSG_WAITALL || sr->flags & IORING_RECVSEND_BUNDLE) min_ret = iov_iter_count(&kmsg->msg.msg_iter); flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; @@ -534,10 +643,12 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) ret += sr->done_io; else if (sr->done_io) ret = sr->done_io; + + if (!io_send_finish(req, &ret, kmsg, issue_flags)) + goto retry_bundle; + io_req_msg_cleanup(req, issue_flags); - cflags = io_put_kbuf(req, issue_flags); - io_req_set_res(req, ret, cflags); - return IOU_OK; + return ret; } static int io_recvmsg_mshot_prep(struct io_kiocb *req, -- cgit v1.2.3 From 2f9c9515bdfde9e4df1f35782284074d3625ff8a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Mar 2024 16:22:04 -0700 Subject: io_uring/net: support bundles for recv If IORING_OP_RECV is used with provided buffers, the caller may also set IORING_RECVSEND_BUNDLE to turn it into a multi-buffer recv. This grabs buffers available and receives into them, posting a single completion for all of it. This can be used with multishot receive as well, or without it. Now that both send and receive support bundles, add a feature flag for it as well. If IORING_FEAT_RECVSEND_BUNDLE is set after registering the ring, then the kernel supports bundles for recv and send. 
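Putting the send-side pieces together, a userspace bundle send looks roughly like the sketch below. This assumes a liburing new enough to provide io_uring_setup_buf_ring() and the IORING_RECVSEND_BUNDLE definition; the group ID, buffer count and sizes are arbitrary, and teardown and most error handling are trimmed.

#include <liburing.h>
#include <errno.h>
#include <stdlib.h>

#define BGID	1
#define NBUFS	8	/* must be a power of two for a buffer ring */
#define BUFSZ	4096

static int send_bundle(struct io_uring *ring, int sockfd)
{
	struct io_uring_buf_ring *br;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *bufs;
	int i, ret;

	if (!(ring->features & IORING_FEAT_RECVSEND_BUNDLE))
		return -EOPNOTSUPP;		/* kernel too old for bundles */

	br = io_uring_setup_buf_ring(ring, NBUFS, BGID, 0, &ret);
	if (!br)
		return ret;
	bufs = malloc(NBUFS * BUFSZ);
	if (!bufs)
		return -ENOMEM;

	/* fill bufs with outgoing data, then queue each one into the ring;
	 * every queued buffer becomes one segment of the bundle */
	for (i = 0; i < NBUFS; i++)
		io_uring_buf_ring_add(br, bufs + i * BUFSZ, BUFSZ, i,
				      io_uring_buf_ring_mask(NBUFS), i);
	io_uring_buf_ring_advance(br, NBUFS);

	sqe = io_uring_get_sqe(ring);
	io_uring_prep_send(sqe, sockfd, NULL, 0, 0);	/* data comes from BGID */
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
	sqe->ioprio |= IORING_RECVSEND_BUNDLE;		/* sr->flags is read from ioprio */
	io_uring_submit(ring);

	/* one CQE covers the whole bundle; the starting buffer ID is in
	 * cqe->flags as usual for provided buffers */
	io_uring_wait_cqe(ring, &cqe);
	ret = cqe->res;
	io_uring_cqe_seen(ring, cqe);
	return ret;
}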
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 15 +++--- io_uring/io_uring.c | 3 +- io_uring/net.c | 116 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 105 insertions(+), 29 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 7f583927c908..f093cb2300d9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -352,13 +352,13 @@ enum io_uring_op { * IORING_NOTIF_USAGE_ZC_COPIED if data was copied * (at least partially). * - * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send will - * grab as many buffers from the buffer group ID - * given and send them all. The completion result - * will be the number of buffers send, with the - * starting buffer ID in cqe->flags as per usual - * for provided buffer usage. The buffers will be - * contigious from the starting buffer ID. + * IORING_RECVSEND_BUNDLE Used with IOSQE_BUFFER_SELECT. If set, send or + * recv will grab as many buffers from the buffer + * group ID given and send them all. The completion + * result will be the number of buffers send, with + * the starting buffer ID in cqe->flags as per + * usual for provided buffer usage. The buffers + * will be contigious from the starting buffer ID. */ #define IORING_RECVSEND_POLL_FIRST (1U << 0) #define IORING_RECV_MULTISHOT (1U << 1) @@ -529,6 +529,7 @@ struct io_uring_params { #define IORING_FEAT_CQE_SKIP (1U << 11) #define IORING_FEAT_LINKED_FILE (1U << 12) #define IORING_FEAT_REG_REG_RING (1U << 13) +#define IORING_FEAT_RECVSEND_BUNDLE (1U << 14) /* * io_uring_register(2) opcodes and arguments diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c67ae6e36c4f..64845634d89f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3583,7 +3583,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | - IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING; + IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | + IORING_FEAT_RECVSEND_BUNDLE; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/io_uring/net.c b/io_uring/net.c index 3e326576254b..51c41d771c50 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -747,7 +747,8 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) return ret; } -#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT) +#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ + IORING_RECVSEND_BUNDLE) int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -761,21 +762,14 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); - if (sr->flags & ~(RECVMSG_FLAGS)) + if (sr->flags & ~RECVMSG_FLAGS) return -EINVAL; sr->msg_flags = READ_ONCE(sqe->msg_flags); if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; if (sr->msg_flags & MSG_ERRQUEUE) req->flags |= REQ_F_CLEAR_POLLIN; - if (sr->flags & IORING_RECV_MULTISHOT) { - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; - if (sr->msg_flags & MSG_WAITALL) - return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) - return -EINVAL; - req->flags |= REQ_F_APOLL_MULTISHOT; + if (req->flags & REQ_F_BUFFER_SELECT) { /* * Store the buffer group for this multishot receive separately, * as if we 
end up doing an io-wq based issue that selects a @@ -785,6 +779,20 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) * restore it. */ sr->buf_group = req->buf_index; + req->buf_list = NULL; + } + if (sr->flags & IORING_RECV_MULTISHOT) { + if (!(req->flags & REQ_F_BUFFER_SELECT)) + return -EINVAL; + if (sr->msg_flags & MSG_WAITALL) + return -EINVAL; + if (req->opcode == IORING_OP_RECV && sr->len) + return -EINVAL; + req->flags |= REQ_F_APOLL_MULTISHOT; + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { + if (req->opcode == IORING_OP_RECVMSG) + return -EINVAL; } #ifdef CONFIG_COMPAT @@ -805,19 +813,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, struct io_async_msghdr *kmsg, bool mshot_finished, unsigned issue_flags) { + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); unsigned int cflags; - cflags = io_put_kbuf(req, issue_flags); + if (sr->flags & IORING_RECVSEND_BUNDLE) + cflags = io_put_kbufs(req, io_bundle_nbufs(kmsg, *ret), + issue_flags); + else + cflags = io_put_kbuf(req, issue_flags); + if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + /* bundle with no more immediate buffers, we're done */ + if (sr->flags & IORING_RECVSEND_BUNDLE && req->flags & REQ_F_BL_EMPTY) + goto finish; + /* * Fill CQE for this receive and see if we should keep trying to * receive from this socket. */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; io_mshot_prep_retry(req, kmsg); @@ -837,6 +854,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } /* Finish the request / stop multishot. */ +finish: io_req_set_res(req, *ret, cflags); if (issue_flags & IO_URING_F_MULTISHOT) @@ -1020,6 +1038,69 @@ retry_multishot: return ret; } +static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg, + size_t *len, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + int ret; + + /* + * If the ring isn't locked, then don't use the peek interface + * to grab multiple buffers as we will lock/unlock between + * this selection and posting the buffers. 
+ */ + if (!(issue_flags & IO_URING_F_UNLOCKED) && + sr->flags & IORING_RECVSEND_BUNDLE) { + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .nr_iovs = 1, + .mode = KBUF_MODE_EXPAND, + }; + + if (kmsg->free_iov) { + arg.nr_iovs = kmsg->free_iov_nr; + arg.iovs = kmsg->free_iov; + arg.mode |= KBUF_MODE_FREE; + } + + if (kmsg->msg.msg_inq > 0) + arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + + ret = io_buffers_peek(req, &arg); + if (unlikely(ret < 0)) + return ret; + + /* special case 1 vec, can be a fast path */ + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + sr->len = arg.iovs[0].iov_len; + goto map_ubuf; + } + iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, + arg.out_len); + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { + kmsg->free_iov_nr = ret; + kmsg->free_iov = arg.iovs; + } + } else { + void __user *buf; + + *len = sr->len; + buf = io_buffer_select(req, len, issue_flags); + if (!buf) + return -ENOBUFS; + sr->buf = buf; + sr->len = *len; +map_ubuf: + ret = import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } + + return 0; +} + int io_recv(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -1044,17 +1125,10 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) retry_multishot: if (io_do_buffer_select(req)) { - void __user *buf; - - buf = io_buffer_select(req, &len, issue_flags); - if (!buf) - return -ENOBUFS; - sr->buf = buf; - sr->len = len; - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); + ret = io_recv_buf_select(req, kmsg, &len, issue_flags); if (unlikely(ret)) goto out_free; + sr->buf = NULL; } kmsg->msg.msg_inq = -1; -- cgit v1.2.3 From 7ab4f16f9e2440e797eae88812f800458e5879d2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Apr 2024 12:08:39 +0100 Subject: net: extend ubuf_info callback to ops structure We'll need to associate additional callbacks with ubuf_info, introduce a structure holding ubuf_info callbacks. Apart from a more smarter io_uring notification management introduced in next patches, it can be used to generalise msg_zerocopy_put_abort() and also store ->sg_from_iter, which is currently passed in struct msghdr. 
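The conversion for ubuf_info users is mechanical; sketched below for a hypothetical driver (the foo_* names are placeholders), it is the same shape the tap, tun, xen-netback, vhost and io_uring hunks that follow take: the callback pointer becomes a const ops table, and completion sites go through uarg->ops->complete(skb, uarg, success).

static void foo_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
				  bool zerocopy_success)
{
	/* driver-specific zerocopy completion handling goes here */
}

static const struct ubuf_info_ops foo_ubuf_ops = {
	.complete = foo_zerocopy_complete,
};

static void foo_init_ubuf(struct ubuf_info *uarg)
{
	/* was: uarg->callback = foo_zerocopy_complete; */
	uarg->ops = &foo_ubuf_ops;
	refcount_set(&uarg->refcnt, 1);
}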
Reviewed-by: Jens Axboe Reviewed-by: David Ahern Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/all/a62015541de49c0e2a8a0377a1d5d0a5aeb07016.1713369317.git.asml.silence@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/tap.c | 2 +- drivers/net/tun.c | 2 +- drivers/net/xen-netback/common.h | 5 ++--- drivers/net/xen-netback/interface.c | 2 +- drivers/net/xen-netback/netback.c | 11 ++++++++--- drivers/vhost/net.c | 8 ++++++-- include/linux/skbuff.h | 19 +++++++++++-------- io_uring/notif.c | 18 +++++++++++++----- net/core/skbuff.c | 16 ++++++++++------ 9 files changed, 53 insertions(+), 30 deletions(-) diff --git a/drivers/net/tap.c b/drivers/net/tap.c index 9f0495e8df4d..bfdd3875fe86 100644 --- a/drivers/net/tap.c +++ b/drivers/net/tap.c @@ -754,7 +754,7 @@ static ssize_t tap_get_user(struct tap_queue *q, void *msg_control, skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; - uarg->callback(NULL, uarg, false); + uarg->ops->complete(NULL, uarg, false); } dev_queue_xmit(skb); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 0b3f21cba552..b7401d990680 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1906,7 +1906,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb_zcopy_init(skb, msg_control); } else if (msg_control) { struct ubuf_info *uarg = msg_control; - uarg->callback(NULL, uarg, false); + uarg->ops->complete(NULL, uarg, false); } skb_reset_network_header(skb); diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 1fcbd83f7ff2..17421da139f2 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -390,9 +390,8 @@ bool xenvif_rx_queue_tail(struct xenvif_queue *queue, struct sk_buff *skb); void xenvif_carrier_on(struct xenvif *vif); -/* Callback from stack when TX packet can be released */ -void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf, - bool zerocopy_success); +/* Callbacks from stack when TX packet can be released */ +extern const struct ubuf_info_ops xenvif_ubuf_ops; static inline pending_ring_idx_t nr_pending_reqs(struct xenvif_queue *queue) { diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c index 7cff90aa8d24..65db5f14465f 100644 --- a/drivers/net/xen-netback/interface.c +++ b/drivers/net/xen-netback/interface.c @@ -593,7 +593,7 @@ int xenvif_init_queue(struct xenvif_queue *queue) for (i = 0; i < MAX_PENDING_REQS; i++) { queue->pending_tx_info[i].callback_struct = (struct ubuf_info_msgzc) - { { .callback = xenvif_zerocopy_callback }, + { { .ops = &xenvif_ubuf_ops }, { { .ctx = NULL, .desc = i } } }; queue->grant_tx_handle[i] = NETBACK_INVALID_HANDLE; diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index ef76850d9bcd..bab7e43ea05b 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -1156,7 +1156,7 @@ static int xenvif_handle_frag_list(struct xenvif_queue *queue, struct sk_buff *s uarg = skb_shinfo(skb)->destructor_arg; /* increase inflight counter to offset decrement in callback */ atomic_inc(&queue->inflight_packets); - uarg->callback(NULL, uarg, true); + uarg->ops->complete(NULL, uarg, true); skb_shinfo(skb)->destructor_arg = NULL; /* Fill the skb with the new (local) frags. 
*/ @@ -1278,8 +1278,9 @@ static int xenvif_tx_submit(struct xenvif_queue *queue) return work_done; } -void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, - bool zerocopy_success) +static void xenvif_zerocopy_callback(struct sk_buff *skb, + struct ubuf_info *ubuf_base, + bool zerocopy_success) { unsigned long flags; pending_ring_idx_t index; @@ -1312,6 +1313,10 @@ void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, xenvif_skb_zerocopy_complete(queue); } +const struct ubuf_info_ops xenvif_ubuf_ops = { + .complete = xenvif_zerocopy_callback, +}; + static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue) { struct gnttab_unmap_grant_ref *gop; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index c64ded183f8d..f16279351db5 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -380,7 +380,7 @@ static void vhost_zerocopy_signal_used(struct vhost_net *net, } } -static void vhost_zerocopy_callback(struct sk_buff *skb, +static void vhost_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *ubuf_base, bool success) { struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); @@ -408,6 +408,10 @@ static void vhost_zerocopy_callback(struct sk_buff *skb, rcu_read_unlock_bh(); } +static const struct ubuf_info_ops vhost_ubuf_ops = { + .complete = vhost_zerocopy_complete, +}; + static inline unsigned long busy_clock(void) { return local_clock() >> 10; @@ -879,7 +883,7 @@ static void handle_tx_zerocopy(struct vhost_net *net, struct socket *sock) vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; ubuf->ctx = nvq->ubufs; ubuf->desc = nvq->upend_idx; - ubuf->ubuf.callback = vhost_zerocopy_callback; + ubuf->ubuf.ops = &vhost_ubuf_ops; ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; refcount_set(&ubuf->ubuf.refcnt, 1); msg.msg_control = &ctl; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9d24aec064e8..a110e97e074a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -527,6 +527,11 @@ enum { #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) +struct ubuf_info_ops { + void (*complete)(struct sk_buff *, struct ubuf_info *, + bool zerocopy_success); +}; + /* * The callback notifies userspace to release buffers when skb DMA is done in * lower device, the skb last reference should be 0 when calling this. @@ -536,8 +541,7 @@ enum { * The desc field is used to track userspace buffer index. 
*/ struct ubuf_info { - void (*callback)(struct sk_buff *, struct ubuf_info *, - bool zerocopy_success); + const struct ubuf_info_ops *ops; refcount_t refcnt; u8 flags; }; @@ -1662,14 +1666,13 @@ static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset) } #endif +extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; + struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success); - int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, struct sk_buff *skb, struct iov_iter *from, size_t length); @@ -1757,13 +1760,13 @@ static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb) static inline void net_zcopy_put(struct ubuf_info *uarg) { if (uarg) - uarg->callback(NULL, uarg, true); + uarg->ops->complete(NULL, uarg, true); } static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) { if (uarg) { - if (uarg->callback == msg_zerocopy_callback) + if (uarg->ops == &msg_zerocopy_ubuf_ops) msg_zerocopy_put_abort(uarg, have_uref); else if (have_uref) net_zcopy_put(uarg); @@ -1777,7 +1780,7 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success) if (uarg) { if (!skb_zcopy_is_nouarg(skb)) - uarg->callback(skb, uarg, zerocopy_success); + uarg->ops->complete(skb, uarg, zerocopy_success); skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; } diff --git a/io_uring/notif.c b/io_uring/notif.c index d3e703c37aba..204f0fc7d966 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -24,7 +24,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state io_req_task_complete(notif, ts); } -static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, +static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); @@ -45,19 +45,27 @@ static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, else if (!success && !nd->zc_copied) WRITE_ONCE(nd->zc_copied, true); } - io_tx_ubuf_callback(skb, uarg, success); + io_tx_ubuf_complete(skb, uarg, success); } +static const struct ubuf_info_ops io_ubuf_ops = { + .complete = io_tx_ubuf_complete, +}; + +static const struct ubuf_info_ops io_ubuf_ops_ext = { + .complete = io_tx_ubuf_callback_ext, +}; + void io_notif_set_extended(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { + if (nd->uarg.ops != &io_ubuf_ops_ext) { nd->account_pages = 0; nd->zc_report = false; nd->zc_used = false; nd->zc_copied = false; - nd->uarg.callback = io_tx_ubuf_callback_ext; + nd->uarg.ops = &io_ubuf_ops_ext; notif->io_task_work.func = io_notif_complete_tw_ext; } } @@ -80,7 +88,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd = io_notif_to_data(notif); nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; - nd->uarg.callback = io_tx_ubuf_callback; + nd->uarg.ops = &io_ubuf_ops; refcount_set(&nd->uarg.refcnt, 1); return notif; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b99127712e67..9d36750459f3 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1708,7 +1708,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->ubuf.callback = msg_zerocopy_callback; + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; uarg->id = 
((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; @@ -1734,7 +1734,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, u32 bytelen, next; /* there might be non MSG_ZEROCOPY users */ - if (uarg->callback != msg_zerocopy_callback) + if (uarg->ops != &msg_zerocopy_ubuf_ops) return NULL; /* realloc only when socket is locked (TCP, UDP cork), @@ -1845,8 +1845,8 @@ release: sock_put(sk); } -void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); @@ -1855,7 +1855,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, if (refcount_dec_and_test(&uarg->refcnt)) __msg_zerocopy_callback(uarg_zc); } -EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { @@ -1865,10 +1864,15 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) uarg_to_msgzc(uarg)->len--; if (have_uref) - msg_zerocopy_callback(NULL, uarg, true); + msg_zerocopy_complete(NULL, uarg, true); } EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); +const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { + .complete = msg_zerocopy_complete, +}; +EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); + int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct msghdr *msg, int len, struct ubuf_info *uarg) -- cgit v1.2.3 From 65bada80dec1f2108a751644773b2120bd789934 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Apr 2024 12:08:40 +0100 Subject: net: add callback for setting a ubuf_info to skb At the moment an skb can only have one ubuf_info associated with it, which might be a performance problem for zerocopy sends in cases like TCP via io_uring. Add a callback for assigning ubuf_info to skb, this way we will implement smarter assignment later like linking ubuf_info together. Note, it's an optional callback, which should be compatible with skb_zcopy_set(), that's because the net stack might potentially decide to clone an skb and take another reference to ubuf_info whenever it wishes. Also, a correct implementation should always be able to bind to an skb without prior ubuf_info, otherwise we could end up in a situation when the send would not be able to progress. Reviewed-by: Jens Axboe Reviewed-by: David Ahern Signed-off-by: Pavel Begunkov Reviewed-by: Willem de Bruijn Link: https://lore.kernel.org/all/b7918aadffeb787c84c9e72e34c729dc04f3a45d.1713369317.git.asml.silence@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/skbuff.h | 2 ++ net/core/skbuff.c | 20 ++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a110e97e074a..ced69f37977f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -530,6 +530,8 @@ enum { struct ubuf_info_ops { void (*complete)(struct sk_buff *, struct ubuf_info *, bool zerocopy_success); + /* has to be compatible with skb_zcopy_set() */ + int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg); }; /* diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9d36750459f3..4a0c10685cd2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1880,11 +1880,18 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, struct ubuf_info *orig_uarg = skb_zcopy(skb); int err, orig_len = skb->len; - /* An skb can only point to one uarg. 
This edge case happens when - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. - */ - if (orig_uarg && uarg != orig_uarg) - return -EEXIST; + if (uarg->ops->link_skb) { + err = uarg->ops->link_skb(skb, uarg); + if (err) + return err; + } else { + /* An skb can only point to one uarg. This edge case happens + * when TCP appends to an skb, but zerocopy_realloc triggered + * a new alloc. + */ + if (orig_uarg && uarg != orig_uarg) + return -EEXIST; + } err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { @@ -1898,7 +1905,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, return err; } - skb_zcopy_set(skb, uarg, NULL); + if (!uarg->ops->link_skb) + skb_zcopy_set(skb, uarg, NULL); return skb->len - orig_len; } EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); -- cgit v1.2.3 From 5a569469b973cb7a6c58192a37dfb8418686e518 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Apr 2024 12:08:41 +0100 Subject: io_uring/notif: simplify io_notif_flush() io_notif_flush() is partially duplicating io_tx_ubuf_complete(), so instead of duplicating it, make the flush call io_tx_ubuf_complete. Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/19e41652c16718b946a5c80d2ad409df7682e47e.1713369317.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 6 +++--- io_uring/notif.h | 9 +++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index 53532d78a947..26680176335f 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); @@ -23,8 +23,8 @@ void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) io_req_task_complete(notif, ts); } -static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, - bool success) +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success) { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); diff --git a/io_uring/notif.h b/io_uring/notif.h index 2e25a2fc77d1..2cf9ff6abd7a 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -21,7 +21,8 @@ struct io_notif_data { }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx); -void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts); +void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, + bool success); static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif) { @@ -33,11 +34,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) { struct io_notif_data *nd = io_notif_to_data(notif); - /* drop slot's master ref */ - if (refcount_dec_and_test(&nd->uarg.refcnt)) { - notif->io_task_work.func = io_notif_tw_complete; - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); - } + io_tx_ubuf_complete(NULL, &nd->uarg, true); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) -- cgit v1.2.3 From 6fe4220912d19152a26ce19713ab232f4263018d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 19 Apr 2024 12:08:42 +0100 Subject: io_uring/notif: implement notification stacking The network stack allows only one ubuf_info per skb, and unlike MSG_ZEROCOPY, 
each io_uring zerocopy send will carry a separate ubuf_info. That means that send requests can't reuse a previously allocated skb and need to get one or more new ones. That's fine for large sends, but otherwise it would spam the stack with lots of skbs carrying just a little data each. To help with that, implement linking notifications (i.e. the io_uring wrappers around ubuf_info) into a list. Each is refcounted by skbs and the stack as usual. Additionally, all non-head entries keep a reference to the head, which they put down when their refcount hits 0. When the head has no more users, it'll efficiently put all notifications in a batch. As mentioned previously about ->link_skb, the callback implementation always allows binding to an skb without a prior ubuf_info. Reviewed-by: Jens Axboe Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bf1e7f9b72f9ecc99999fdc0d2cded5eea87fd0b.1713369317.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++------ io_uring/notif.h | 3 +++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index 26680176335f..d58cdc01e691 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -9,18 +9,28 @@ #include "notif.h" #include "rsrc.h" +static const struct ubuf_info_ops io_ubuf_ops; + static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); - if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) - notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + do { + notif = cmd_to_io_kiocb(nd); - if (nd->account_pages && notif->ctx->user) { - __io_unaccount_mem(notif->ctx->user, nd->account_pages); - nd->account_pages = 0; - } - io_req_task_complete(notif, ts); + lockdep_assert(refcount_read(&nd->uarg.refcnt) == 0); + + if (unlikely(nd->zc_report) && (nd->zc_copied || !nd->zc_used)) + notif->cqe.res |= IORING_NOTIF_USAGE_ZC_COPIED; + + if (nd->account_pages && notif->ctx->user) { + __io_unaccount_mem(notif->ctx->user, nd->account_pages); + nd->account_pages = 0; + } + + nd = nd->next; + io_req_task_complete(notif, ts); + } while (nd); } void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, @@ -39,12 +49,56 @@ void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, if (!refcount_dec_and_test(&uarg->refcnt)) return; + if (nd->head != nd) { + io_tx_ubuf_complete(skb, &nd->head->uarg, success); + return; + } notif->io_task_work.func = io_notif_tw_complete; __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } +static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) +{ + struct io_notif_data *nd, *prev_nd; + struct io_kiocb *prev_notif, *notif; + struct ubuf_info *prev_uarg = skb_zcopy(skb); + + nd = container_of(uarg, struct io_notif_data, uarg); + notif = cmd_to_io_kiocb(nd); + + if (!prev_uarg) { + net_zcopy_get(&nd->uarg); + skb_zcopy_init(skb, &nd->uarg); + return 0; + } + /* handle it separately as we can't link a notif to itself */ + if (unlikely(prev_uarg == &nd->uarg)) + return 0; + /* we can't join two links together, just request a fresh skb */ + if (unlikely(nd->head != nd || nd->next)) + return -EEXIST; + /* don't mix zc providers */ + if (unlikely(prev_uarg->ops != &io_ubuf_ops)) + return -EEXIST; + + prev_nd = container_of(prev_uarg, struct io_notif_data, uarg); + prev_notif = cmd_to_io_kiocb(nd); + + /* make sure all noifications can be finished in the same task_work */ + if (unlikely(notif->ctx !=
prev_notif->ctx || + notif->task != prev_notif->task)) + return -EEXIST; + + nd->head = prev_nd->head; + nd->next = prev_nd->next; + prev_nd->next = nd; + net_zcopy_get(&nd->head->uarg); + return 0; +} + static const struct ubuf_info_ops io_ubuf_ops = { .complete = io_tx_ubuf_complete, + .link_skb = io_link_skb, }; struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) @@ -65,6 +119,9 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) nd = io_notif_to_data(notif); nd->zc_report = false; nd->account_pages = 0; + nd->next = NULL; + nd->head = nd; + nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; nd->uarg.ops = &io_ubuf_ops; refcount_set(&nd->uarg.refcnt, 1); diff --git a/io_uring/notif.h b/io_uring/notif.h index 2cf9ff6abd7a..f3589cfef4a9 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -14,6 +14,9 @@ struct io_notif_data { struct file *file; struct ubuf_info uarg; + struct io_notif_data *next; + struct io_notif_data *head; + unsigned account_pages; bool zc_report; bool zc_used; -- cgit v1.2.3 From 039a2e800bcd5beb89909d1a488abf3d647642cf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 25 Apr 2024 09:04:32 -0600 Subject: io_uring/rw: reinstate thread check for retries Allowing retries for everything is arguably the right thing to do, now that every command type is async read from the start. But it's exposed a few issues around missing check for a retry (which cca6571381a0 exposed), and the fixup commit for that isn't necessarily 100% sound in terms of iov_iter state. For now, just revert these two commits. This unfortunately then re-opens the fact that -EAGAIN can get bubbled to userspace for some cases where the kernel very well could just sanely retry them. But until we have all the conditions covered around that, we cannot safely enable that. This reverts commit df604d2ad480fcf7b39767280c9093e13b1de952. This reverts commit cca6571381a0bdc88021a1f7a4c2349df21279f7. 
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 ------------- io_uring/io_uring.h | 1 - io_uring/rw.c | 40 +++++++++++++++++++++++++++++----------- 3 files changed, 29 insertions(+), 25 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 64845634d89f..2675cffbd9a4 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -527,19 +527,6 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_tw_requeue_iowq(struct io_kiocb *req, struct io_tw_state *ts) -{ - req->flags &= ~REQ_F_REISSUE; - io_queue_iowq(req); -} - -void io_tw_queue_iowq(struct io_kiocb *req) -{ - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - req->io_task_work.func = io_tw_requeue_iowq; - io_req_task_work_add(req); -} - static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index b83a719c5443..624ca9076a50 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,7 +75,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_tw_queue_iowq(struct io_kiocb *req); void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4fed829fe97c..a6bf2ea8db91 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -396,9 +396,16 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } +#ifdef CONFIG_BLOCK +static void io_resubmit_prep(struct io_kiocb *req) +{ + struct io_async_rw *io = req->async_data; + + iov_iter_restore(&io->iter, &io->iter_state); +} + static bool io_rw_should_reissue(struct io_kiocb *req) { -#ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; struct io_ring_ctx *ctx = req->ctx; @@ -414,11 +421,23 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; + /* + * Play it safe and assume not safe to re-import and reissue if we're + * not in the original thread group (or in task context). + */ + if (!same_thread_group(req->task, current) || !in_task()) + return false; return true; +} #else +static void io_resubmit_prep(struct io_kiocb *req) +{ +} +static bool io_rw_should_reissue(struct io_kiocb *req) +{ return false; -#endif } +#endif static void io_req_end_write(struct io_kiocb *req) { @@ -455,7 +474,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) * current cycle. 
*/ io_req_io_end(req); - io_tw_queue_iowq(req); + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return true; } req_set_fail(req); @@ -521,7 +540,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { if (res == -EAGAIN && io_rw_should_reissue(req)) { - io_tw_queue_iowq(req); + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; return; } req->cqe.res = res; @@ -583,10 +602,8 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, } if (req->flags & REQ_F_REISSUE) { - struct io_async_rw *io = req->async_data; - req->flags &= ~REQ_F_REISSUE; - iov_iter_restore(&io->iter, &io->iter_state); + io_resubmit_prep(req); return -EAGAIN; } return IOU_ISSUE_SKIP_COMPLETE; @@ -839,8 +856,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(rw, &io->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - if (req->flags & REQ_F_REISSUE) - return IOU_ISSUE_SKIP_COMPLETE; + req->flags &= ~REQ_F_REISSUE; /* If we can poll, just do that. */ if (io_file_can_poll(req)) return -EAGAIN; @@ -1035,8 +1051,10 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) - return IOU_ISSUE_SKIP_COMPLETE; + if (req->flags & REQ_F_REISSUE) { + req->flags &= ~REQ_F_REISSUE; + ret2 = -EAGAIN; + } /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just -- cgit v1.2.3 From a4d416dc60980f741f0bfa1f34a1059c498c1b4e Mon Sep 17 00:00:00 2001 From: linke li Date: Fri, 26 Apr 2024 11:24:37 +0800 Subject: io_uring/msg_ring: reuse ctx->submitter_task read using READ_ONCE instead of re-reading it In io_msg_exec_remote(), ctx->submitter_task is read using READ_ONCE at the beginning of the function, checked, and then re-read from ctx->submitter_task, voiding all guarantees of the checks. Reuse the value that was read by READ_ONCE to ensure the consistency of the task struct throughout the function. Signed-off-by: linke li Link: https://lore.kernel.org/r/tencent_F9B2296C93928D6F68FF0C95C33475C68209@qq.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index cd6dcf634ba3..d74670b39ed6 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -83,7 +83,7 @@ static int io_msg_exec_remote(struct io_kiocb *req, task_work_func_t func) return -EOWNERDEAD; init_task_work(&msg->tw, func); - if (task_work_add(ctx->submitter_task, &msg->tw, TWA_SIGNAL)) + if (task_work_add(task, &msg->tw, TWA_SIGNAL)) return -EOWNERDEAD; return IOU_ISSUE_SKIP_COMPLETE; -- cgit v1.2.3 From ef42b85a5609cd822ca0a68dd2bef2b12b5d1ca3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Apr 2024 16:42:30 +0100 Subject: io_uring/net: fix sendzc lazy wake polling SEND[MSG]_ZC produces multiple CQEs via notifications, LAZY_WAKE doesn't handle it and so disable LAZY_WAKE for sendzc polling. It should be fine, sends are not likely to be polled in the first place. 
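For context on the multiple CQEs mentioned above, a rough liburing-side sketch, assuming io_uring_prep_send_zc() is available: a single SEND_ZC request completes with a result CQE and, when IORING_CQE_F_MORE is set on it, a later IORING_CQE_F_NOTIF notification once the buffer may be reused.

#include <liburing.h>

static int send_zc_once(struct io_uring *ring, int sockfd,
                        const void *buf, unsigned len)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int res, more;

        io_uring_prep_send_zc(sqe, sockfd, buf, len, 0, 0);
        io_uring_submit(ring);

        /* first CQE: the send result; F_MORE says a notification follows */
        if (io_uring_wait_cqe(ring, &cqe))
                return -1;
        res = cqe->res;
        more = cqe->flags & IORING_CQE_F_MORE;
        io_uring_cqe_seen(ring, cqe);

        /* second CQE: the zerocopy notification, flagged IORING_CQE_F_NOTIF */
        if (more && !io_uring_wait_cqe(ring, &cqe))
                io_uring_cqe_seen(ring, cqe);
        return res;
}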
Fixes: 6ce4a93dbb5b ("io_uring/poll: use IOU_F_TWQ_LAZY_WAKE for wakeups") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5b360fb352d91e3aec751d75c87dfb4753a084ee.1714488419.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/net.c b/io_uring/net.c index 51c41d771c50..503debecad32 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1198,6 +1198,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *notif; zc->done_io = 0; + req->flags |= REQ_F_POLL_NO_LAZY; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; -- cgit v1.2.3 From 19352a1d395424b5f8c03289a85fbd6622d6601a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 30 Apr 2024 16:42:31 +0100 Subject: io_uring/notif: disable LAZY_WAKE for linked notifs Notifications may now be linked and thus a single tw can post multiple CQEs, it's not safe to use LAZY_WAKE with them. Disable LAZY_WAKE for now, if that'd prove to be a problem we can count them and pass the expected number of CQEs into __io_req_task_work_add(). Fixes: 6fe4220912d19 ("io_uring/notif: implement notification stacking") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/0a5accdb7d2d0d27ebec14f8106e14e0192fae17.1714488419.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/notif.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/io_uring/notif.c b/io_uring/notif.c index d58cdc01e691..28859ae3ee6e 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -38,6 +38,7 @@ void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, { struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); struct io_kiocb *notif = cmd_to_io_kiocb(nd); + unsigned tw_flags; if (nd->zc_report) { if (success && !nd->zc_used && skb) @@ -53,8 +54,10 @@ void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, io_tx_ubuf_complete(skb, &nd->head->uarg, success); return; } + + tw_flags = nd->next ? 0 : IOU_F_TWQ_LAZY_WAKE; notif->io_task_work.func = io_notif_tw_complete; - __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); + __io_req_task_work_add(notif, tw_flags); } static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) -- cgit v1.2.3 From 79996b45f7b28c0e3e08a95bab80119e95317e28 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 1 May 2024 14:56:36 -0600 Subject: io_uring: Require zeroed sqe->len on provided-buffers send When sending from a provided buffer, we set sr->len to be the smallest between the actual buffer size and sqe->len. But, now that we disconnect the buffer from the submission request, we can get in a situation where the buffers and requests mismatch, and only part of a buffer gets sent. Assume: * buf[1]->len = 128; buf[2]->len = 256 * sqe[1]->len = 128; sqe[2]->len = 256 If sqe1 runs first, it picks buff[1] and it's all good. But, if sqe[2] runs first, sqe[1] picks buff[2], and the last half of buff[2] is never sent. While arguably the use-case of different-length sends is questionable, it has already raised confusion with potential users of this feature. Let's make the interface less tricky by forcing the length to only come from the buffer ring entry itself. 
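To make the new rule concrete, a minimal liburing sketch of a provided-buffer send after this change, assuming a buffer ring has already been registered for group 0 (the helper name is made up for illustration):

#include <liburing.h>

static void queue_pbuf_send(struct io_uring *ring, int sockfd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        /* both the address and the length come from the buffer ring entry */
        io_uring_prep_send(sqe, sockfd, NULL, 0, 0);
        sqe->flags |= IOSQE_BUFFER_SELECT;
        sqe->buf_group = 0;             /* group registered beforehand */
        /* after this patch, a non-zero sqe->len here fails with -EINVAL */
}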
Fixes: ac5f71a3d9d7 ("io_uring/net: add provided buffer support for IORING_OP_SEND") Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Jens Axboe --- io_uring/net.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/net.c b/io_uring/net.c index 503debecad32..b0bf8471ecb7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -423,6 +423,8 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } + if (req->flags & REQ_F_BUFFER_SELECT && sr->len) + return -EINVAL; #ifdef CONFIG_COMPAT if (req->ctx->compat) @@ -586,7 +588,7 @@ retry_bundle: if (io_do_buffer_select(req)) { struct buf_sel_arg arg = { .iovs = &kmsg->fast_iov, - .max_len = min_not_zero(sr->len, INT_MAX), + .max_len = INT_MAX, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, }; -- cgit v1.2.3 From 59b28a6e37e650c0d601ed87875b6217140cda5d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 28 Mar 2024 10:42:40 -0600 Subject: io_uring/msg_ring: cleanup posting to IOPOLL vs !IOPOLL ring Move the posting outside the checking and locking, it's cleaner that way. Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index d74670b39ed6..81c4a9d43729 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -147,13 +147,11 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; - io_double_unlock_ctx(target_ctx); - } else { - if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) - ret = 0; } + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) + ret = 0; + if (target_ctx->flags & IORING_SETUP_IOPOLL) + io_double_unlock_ctx(target_ctx); return ret; } -- cgit v1.2.3 From 8a565304927fbd28c9f028c492b5c1714002cbab Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 7 May 2024 10:00:01 -0700 Subject: io_uring/io-wq: Use set_bit() and test_bit() at worker->flags Utilize set_bit() and test_bit() on worker->flags within io_uring/io-wq to address potential data races. The structure io_worker->flags may be accessed through various data paths, leading to concurrency issues. When KCSAN is enabled, it reveals data races occurring in io_worker_handle_work and io_wq_activate_free_worker functions. BUG: KCSAN: data-race in io_worker_handle_work / io_wq_activate_free_worker write to 0xffff8885c4246404 of 4 bytes by task 49071 on cpu 28: io_worker_handle_work (io_uring/io-wq.c:434 io_uring/io-wq.c:569) io_wq_worker (io_uring/io-wq.c:?) read to 0xffff8885c4246404 of 4 bytes by task 49024 on cpu 5: io_wq_activate_free_worker (io_uring/io-wq.c:? io_uring/io-wq.c:285) io_wq_enqueue (io_uring/io-wq.c:947) io_queue_iowq (io_uring/io_uring.c:524) io_req_task_submit (io_uring/io_uring.c:1511) io_handle_tw_list (io_uring/io_uring.c:1198) Line numbers against commit 18daea77cca6 ("Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm"). These races involve writes and reads to the same memory location by different tasks running on different CPUs. To mitigate this, refactor the code to use atomic operations such as set_bit(), test_bit(), and clear_bit() instead of basic "and" and "or" operations. This ensures thread-safe manipulation of worker flags. 
Also, move `create_index` to avoid holes in the structure. Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20240507170002.2269003-1-leitao@debian.org Signed-off-by: Jens Axboe --- io_uring/io-wq.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index d7fc6f6d4477..d1c47a9d9215 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -25,10 +25,10 @@ #define WORKER_IDLE_TIMEOUT (5 * HZ) enum { - IO_WORKER_F_UP = 1, /* up and active */ - IO_WORKER_F_RUNNING = 2, /* account as running */ - IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_BOUND = 8, /* is doing bounded work */ + IO_WORKER_F_UP = 0, /* up and active */ + IO_WORKER_F_RUNNING = 1, /* account as running */ + IO_WORKER_F_FREE = 2, /* worker on free list */ + IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -44,7 +44,8 @@ enum { */ struct io_worker { refcount_t ref; - unsigned flags; + int create_index; + unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; @@ -57,7 +58,6 @@ struct io_worker { unsigned long create_state; struct callback_head create_work; - int create_index; union { struct rcu_head rcu; @@ -164,7 +164,7 @@ static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, worker->flags & IO_WORKER_F_BOUND); + return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); } static void io_worker_ref_put(struct io_wq *wq) @@ -224,7 +224,7 @@ static void io_worker_exit(struct io_worker *worker) wait_for_completion(&worker->ref_done); raw_spin_lock(&wq->lock); - if (worker->flags & IO_WORKER_F_FREE) + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); raw_spin_unlock(&wq->lock); @@ -409,7 +409,7 @@ static void io_wq_dec_running(struct io_worker *worker) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; if (!atomic_dec_and_test(&acct->nr_running)) @@ -429,8 +429,8 @@ static void io_wq_dec_running(struct io_worker *worker) */ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) { - if (worker->flags & IO_WORKER_F_FREE) { - worker->flags &= ~IO_WORKER_F_FREE; + if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { + clear_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_lock(&wq->lock); hlist_nulls_del_init_rcu(&worker->nulls_node); raw_spin_unlock(&wq->lock); @@ -443,8 +443,8 @@ static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) __must_hold(wq->lock) { - if (!(worker->flags & IO_WORKER_F_FREE)) { - worker->flags |= IO_WORKER_F_FREE; + if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { + set_bit(IO_WORKER_F_FREE, &worker->flags); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); } } @@ -632,7 +632,8 @@ static int io_wq_worker(void *data) bool exit_mask = false, last_timeout = false; char buf[TASK_COMM_LEN]; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); + set_mask_bits(&worker->flags, 0, + BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); @@ -696,11 +697,11 @@ void io_wq_worker_running(struct 
task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (worker->flags & IO_WORKER_F_RUNNING) + if (test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags |= IO_WORKER_F_RUNNING; + set_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_inc_running(worker); } @@ -714,12 +715,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk) if (!worker) return; - if (!(worker->flags & IO_WORKER_F_UP)) + if (!test_bit(IO_WORKER_F_UP, &worker->flags)) return; - if (!(worker->flags & IO_WORKER_F_RUNNING)) + if (!test_bit(IO_WORKER_F_RUNNING, &worker->flags)) return; - worker->flags &= ~IO_WORKER_F_RUNNING; + clear_bit(IO_WORKER_F_RUNNING, &worker->flags); io_wq_dec_running(worker); } @@ -733,7 +734,7 @@ static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, raw_spin_lock(&wq->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); list_add_tail_rcu(&worker->all_list, &wq->all_list); - worker->flags |= IO_WORKER_F_FREE; + set_bit(IO_WORKER_F_FREE, &worker->flags); raw_spin_unlock(&wq->lock); wake_up_new_task(tsk); } @@ -839,7 +840,7 @@ fail: init_completion(&worker->ref_done); if (index == IO_WQ_ACCT_BOUND) - worker->flags |= IO_WORKER_F_BOUND; + set_bit(IO_WORKER_F_BOUND, &worker->flags); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { @@ -925,8 +926,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { struct io_wq_acct *acct = io_work_get_acct(wq, work); + unsigned long work_flags = work->flags; struct io_cb_cancel_data match; - unsigned work_flags = work->flags; bool do_create; /* -- cgit v1.2.3 From 340f634aa43d4172771a784da31e5d4c7c7d3126 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 May 2024 15:09:02 -0600 Subject: io_uring/filetable: don't unnecessarily clear/reset bitmap If we're updating an existing slot, we clear the slot bitmap only to set it again right after. Just leave the bit set rather than toggle it off and on, and move the unused slot setting into the branch of not already having a file occupy this slot. Signed-off-by: Jens Axboe --- io_uring/filetable.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 6e86e6188dbe..997c56d32ee6 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -84,12 +84,12 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, return ret; file_slot->file_ptr = 0; - io_file_bitmap_clear(&ctx->file_table, slot_index); + } else { + io_file_bitmap_set(&ctx->file_table, slot_index); } *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); return 0; } -- cgit v1.2.3 From 7dcc758cca432510f77b2fe1077be2314bc3785b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 May 2024 14:06:15 -0600 Subject: io_uring/net: add IORING_ACCEPT_DONTWAIT flag This allows the caller to perform a non-blocking attempt, similarly to how recvmsg has MSG_DONTWAIT. If set, and we get -EAGAIN on a connection attempt, propagate the result to userspace rather than arm poll and wait for a retry. 
Suggested-by: Norman Maurer Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f093cb2300d9..4a645d15516f 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -379,6 +379,7 @@ enum io_uring_op { * accept flags stored in sqe->ioprio */ #define IORING_ACCEPT_MULTISHOT (1U << 0) +#define IORING_ACCEPT_DONTWAIT (1U << 1) /* * IORING_OP_MSG_RING command types, stored in sqe->addr diff --git a/io_uring/net.c b/io_uring/net.c index b0bf8471ecb7..7861bc8fe8b1 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -28,6 +28,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + int iou_flags; u32 file_slot; unsigned long nofile; }; @@ -1489,7 +1490,6 @@ void io_sendrecv_fail(struct io_kiocb *req) int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); - unsigned flags; if (sqe->len || sqe->buf_index) return -EINVAL; @@ -1498,15 +1498,15 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); - flags = READ_ONCE(sqe->ioprio); - if (flags & ~IORING_ACCEPT_MULTISHOT) + accept->iou_flags = READ_ONCE(sqe->ioprio); + if (accept->iou_flags & ~(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT)) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); if (accept->file_slot) { if (accept->flags & SOCK_CLOEXEC) return -EINVAL; - if (flags & IORING_ACCEPT_MULTISHOT && + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT && accept->file_slot != IORING_FILE_INDEX_ALLOC) return -EINVAL; } @@ -1514,8 +1514,10 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - if (flags & IORING_ACCEPT_MULTISHOT) + if (accept->iou_flags & IORING_ACCEPT_MULTISHOT) req->flags |= REQ_F_APOLL_MULTISHOT; + if (accept->iou_flags & IORING_ACCEPT_DONTWAIT) + req->flags |= REQ_F_NOWAIT; return 0; } @@ -1540,7 +1542,8 @@ retry: if (!fixed) put_unused_fd(fd); ret = PTR_ERR(file); - if (ret == -EAGAIN && force_nonblock) { + if (ret == -EAGAIN && force_nonblock && + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { /* * if it's multishot and polled, we don't need to * return EAGAIN to arm the poll infra since it -- cgit v1.2.3 From d3da8e98592693811c14c31f05380f378411fea1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 8 May 2024 08:17:50 -0600 Subject: io_uring/net: add IORING_ACCEPT_POLL_FIRST flag Similarly to how polling first is supported for receive, it makes sense to provide the same for accept. An accept operation does a lot of expensive setup, like allocating an fd, a socket/inode, etc. If no connection request is already pending, this is wasted and will just be cleaned up and freed, only to retry via the usual poll trigger. Add IORING_ACCEPT_POLL_FIRST, which tells accept to only initiate the accept request if poll says we have something to accept. 
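For illustration, a liburing sketch combining IORING_ACCEPT_DONTWAIT from the previous patch with IORING_ACCEPT_POLL_FIRST from this one; the helper name is invented, and the flags go into sqe->ioprio as defined above.

#include <liburing.h>

static void queue_accept(struct io_uring *ring, int listen_fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        io_uring_prep_accept(sqe, listen_fd, NULL, NULL, 0);
        /* accept-specific flags are carried in sqe->ioprio:
         * POLL_FIRST - don't attempt the accept until poll reports a pending
         * connection; DONTWAIT - if the attempt still gets -EAGAIN, return it
         * to userspace instead of re-arming poll.
         */
        sqe->ioprio |= IORING_ACCEPT_POLL_FIRST | IORING_ACCEPT_DONTWAIT;
}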
Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4a645d15516f..6dbac55f8686 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -380,6 +380,7 @@ enum io_uring_op { */ #define IORING_ACCEPT_MULTISHOT (1U << 0) #define IORING_ACCEPT_DONTWAIT (1U << 1) +#define IORING_ACCEPT_POLL_FIRST (1U << 2) /* * IORING_OP_MSG_RING command types, stored in sqe->addr diff --git a/io_uring/net.c b/io_uring/net.c index 7861bc8fe8b1..070dea9a4eda 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -1487,6 +1487,9 @@ void io_sendrecv_fail(struct io_kiocb *req) req->cqe.flags |= IORING_CQE_F_MORE; } +#define ACCEPT_FLAGS (IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT | \ + IORING_ACCEPT_POLL_FIRST) + int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); @@ -1499,7 +1502,7 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); accept->iou_flags = READ_ONCE(sqe->ioprio); - if (accept->iou_flags & ~(IORING_ACCEPT_MULTISHOT | IORING_ACCEPT_DONTWAIT)) + if (accept->iou_flags & ~ACCEPT_FLAGS) return -EINVAL; accept->file_slot = READ_ONCE(sqe->file_index); @@ -1530,6 +1533,10 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; + if (!(req->flags & REQ_F_POLLED) && + accept->iou_flags & IORING_ACCEPT_POLL_FIRST) + return -EAGAIN; + retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); -- cgit v1.2.3 From 3d8f874bd620ce03f75a5512847586828ab86544 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 10 May 2024 11:50:27 +0800 Subject: io_uring: fail NOP if non-zero op flags is passed in The NOP op flags should have been checked from beginning like any other opcode, otherwise NOP may not be extended with the op flags. Given both liburing and Rust io-uring crate always zeros SQE op flags, just ignore users which play raw NOP uring interface without zeroing SQE, because NOP is just for test purpose. Then we can save one NOP2 opcode. Suggested-by: Jens Axboe Fixes: 2b188cc1bb85 ("Add io_uring IO interface") Cc: stable@vger.kernel.org Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240510035031.78874-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- io_uring/nop.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/io_uring/nop.c b/io_uring/nop.c index d956599a3c1b..1a4e312dfe51 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -12,6 +12,8 @@ int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + if (READ_ONCE(sqe->rw_flags)) + return -EINVAL; return 0; } -- cgit v1.2.3 From deb1e496a83557896fe0cca0b8af01c2a97c0dc6 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 10 May 2024 11:50:28 +0800 Subject: io_uring: support to inject result for NOP Support to inject result for NOP so that we can inject failure from userspace. It is very helpful for covering failure handling code in io_uring core change. With nop flags, it becomes possible to add more test features on NOP in future. 
Suggested-by: Jens Axboe Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20240510035031.78874-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 8 ++++++++ io_uring/nop.c | 26 +++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6dbac55f8686..994bf7af0efe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -72,6 +72,7 @@ struct io_uring_sqe { __u32 waitid_flags; __u32 futex_flags; __u32 install_fd_flags; + __u32 nop_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -407,6 +408,13 @@ enum io_uring_msg_ring_flags { */ #define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) +/* + * IORING_OP_NOP flags (sqe->nop_flags) + * + * IORING_NOP_INJECT_RESULT Inject result from sqe->result + */ +#define IORING_NOP_INJECT_RESULT (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/nop.c b/io_uring/nop.c index 1a4e312dfe51..a5bcf3d6984f 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -10,18 +10,34 @@ #include "io_uring.h" #include "nop.h" +struct io_nop { + /* NOTE: kiocb has the file as the first member, so don't do it here */ + struct file *file; + int result; +}; + int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (READ_ONCE(sqe->rw_flags)) + unsigned int flags; + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + flags = READ_ONCE(sqe->nop_flags); + if (flags & ~IORING_NOP_INJECT_RESULT) return -EINVAL; + + if (flags & IORING_NOP_INJECT_RESULT) + nop->result = READ_ONCE(sqe->len); + else + nop->result = 0; return 0; } -/* - * IORING_OP_NOP just posts a completion event, nothing else. - */ int io_nop(struct io_kiocb *req, unsigned int issue_flags) { - io_req_set_res(req, 0, 0); + struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + + if (nop->result < 0) + req_set_fail(req); + io_req_set_res(req, nop->result, 0); return IOU_OK; } -- cgit v1.2.3
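As a usage note for the two NOP patches above, a small liburing sketch that injects a failing NOP; it assumes a header new enough to expose the nop_flags member (older headers alias the same 32-bit word as rw_flags).

#include <liburing.h>
#include <errno.h>

static void queue_failing_nop(struct io_uring *ring)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        io_uring_prep_nop(sqe);
        sqe->nop_flags = IORING_NOP_INJECT_RESULT;
        sqe->len = (__u32)-EIO;         /* CQE res comes back as -EIO */
}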