From f6f7f903e78dddcb1e1552b896e0e3e9c14c17ae Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:51 +0000 Subject: io_uring: kill io_poll_issue's PF_EXITING check We don't need to worry about checking PF_EXITING in io_poll_issue(). task works using the function should take care of it and never try to resubmit / retry if the task is dying. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e9dc998dc07507c759a0c9cb5d2fbea0710d58c.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index adecdf65b130..15d285d8ce0f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1808,8 +1808,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) int io_poll_issue(struct io_kiocb *req, bool *locked) { io_tw_lock(req->ctx, locked); - if (unlikely(req->task->flags & PF_EXITING)) - return -EFAULT; return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } -- cgit v1.2.3 From 9805fa2d94993e16efd0e1adbd2b54d8d1fe2f9f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:52 +0000 Subject: io_uring: carve io_poll_check_events fast path The fast path in io_poll_check_events() is when we have only one (i.e. master) reference. Move all verification, cancellations checks, edge case handling and so on under a common if. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8c21c5d5e027e32dc553705e88796dec79ff6f93.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 599ba28c89b2..8987e13d302e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -247,27 +247,30 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) do { v = atomic_read(&req->poll_refs); - /* tw handler should be the owner, and so have some references */ - if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return IOU_POLL_DONE; - if (v & IO_POLL_CANCEL_FLAG) - return -ECANCELED; - /* - * cqe.res contains only events of the first wake up - * and all others are be lost. Redo vfs_poll() to get - * up to date state. - */ - if ((v & IO_POLL_REF_MASK) != 1) - req->cqe.res = 0; - if (v & IO_POLL_RETRY_FLAG) { - req->cqe.res = 0; + if (unlikely(v != 1)) { + /* tw should be the owner and so have some refs */ + if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) + return IOU_POLL_DONE; + if (v & IO_POLL_CANCEL_FLAG) + return -ECANCELED; /* - * We won't find new events that came in between - * vfs_poll and the ref put unless we clear the flag - * in advance. + * cqe.res contains only events of the first wake up + * and all others are to be lost. Redo vfs_poll() to get + * up to date state. */ - atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); - v &= ~IO_POLL_RETRY_FLAG; + if ((v & IO_POLL_REF_MASK) != 1) + req->cqe.res = 0; + + if (v & IO_POLL_RETRY_FLAG) { + req->cqe.res = 0; + /* + * We won't find new events that came in between + * vfs_poll and the ref put unless we clear the + * flag in advance. 
+ */ + atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs); + v &= ~IO_POLL_RETRY_FLAG; + } } /* the mask was stashed in __io_poll_execute */ -- cgit v1.2.3 From 047b6aef0966f9863e1940b57c256ebbb465a6b5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:53 +0000 Subject: io_uring: remove ctx variable in io_poll_check_events ctx is only used by io_poll_check_events() for multishot poll CQE posting, don't save it on stack in advance. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/552c1771f8a0e7688afdb4f538ead245f53e80e7.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 8987e13d302e..ada0017e3d88 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -237,7 +237,6 @@ enum { */ static int io_poll_check_events(struct io_kiocb *req, bool *locked) { - struct io_ring_ctx *ctx = req->ctx; int v, ret; /* req->task == current here, checking PF_EXITING is safe */ @@ -289,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(ctx, *locked, req->cqe.user_data, + if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; -- cgit v1.2.3 From c3bfb57ea7011e0c04e4b7f28cb357a551b1efb9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:54 +0000 Subject: io_uring: improve poll warning handling Don't try to complete requests if their refs are broken and we've got a warning, it's much better to drop them and potentially leaking than double freeing. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/31edf9f96f05d03ab62c114508a231a2dce434cb.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index ada0017e3d88..8f16d2a48ff8 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -249,7 +249,7 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) if (unlikely(v != 1)) { /* tw should be the owner and so have some refs */ if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK))) - return IOU_POLL_DONE; + return IOU_POLL_NO_ACTION; if (v & IO_POLL_CANCEL_FLAG) return -ECANCELED; /* -- cgit v1.2.3 From 443e57550670234f1bd34983b3c577edcf2eeef5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:55 +0000 Subject: io_uring: combine poll tw handlers Merge apoll and regular poll tw handlers, it will help with inlining. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/482e59edb9fc81bd275fdbf486837330fb27120a.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/poll.c | 54 +++++++++++++++++++++--------------------------------- 1 file changed, 21 insertions(+), 33 deletions(-) (limited to 'io_uring') diff --git a/io_uring/poll.c b/io_uring/poll.c index 8f16d2a48ff8..ee7da6150ec4 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -321,50 +321,38 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) ret = io_poll_check_events(req, locked); if (ret == IOU_POLL_NO_ACTION) return; - - if (ret == IOU_POLL_DONE) { - struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll); - req->cqe.res = mangle_poll(req->cqe.res & poll->events); - } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { - req->cqe.res = ret; - req_set_fail(req); - } - io_poll_remove_entries(req); io_poll_tw_hash_eject(req, locked); - io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); -} + if (req->opcode == IORING_OP_POLL_ADD) { + if (ret == IOU_POLL_DONE) { + struct io_poll *poll; -static void io_apoll_task_func(struct io_kiocb *req, bool *locked) -{ - int ret; - - ret = io_poll_check_events(req, locked); - if (ret == IOU_POLL_NO_ACTION) - return; - - io_tw_lock(req->ctx, locked); - io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + poll = io_kiocb_to_cmd(req, struct io_poll); + req->cqe.res = mangle_poll(req->cqe.res & poll->events); + } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { + req->cqe.res = ret; + req_set_fail(req); + } - if (ret == IOU_POLL_REMOVE_POLL_USE_RES) + io_req_set_res(req, req->cqe.res, 0); io_req_task_complete(req, locked); - else if (ret == IOU_POLL_DONE) - io_req_task_submit(req, locked); - else - io_req_defer_failed(req, ret); + } else { + io_tw_lock(req->ctx, locked); + + if (ret == IOU_POLL_REMOVE_POLL_USE_RES) + io_req_task_complete(req, locked); + else if (ret == IOU_POLL_DONE) + io_req_task_submit(req, locked); + else + io_req_defer_failed(req, ret); + } } static void __io_poll_execute(struct io_kiocb *req, int mask) { io_req_set_res(req, mask, 0); - - if (req->opcode == IORING_OP_POLL_ADD) - req->io_task_work.func = io_poll_task_func; - else - req->io_task_work.func = io_apoll_task_func; + req->io_task_work.func = io_poll_task_func; trace_io_uring_task_add(req, mask); io_req_task_work_add(req); -- cgit v1.2.3 From 618d653a345a477aaae307a0455900eb8789e952 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:56 +0000 Subject: io_uring: don't raw spin unlock to match cq_lock There is one newly added place when we lock ring with io_cq_lock() but unlocking is hand coded calling spin_unlock directly. It's ugly and troublesome in the long run. Make it consistent with the other completion locking. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4ca4f0564492b90214a190cd5b2a6c76522de138.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 15d285d8ce0f..c30765579a8e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -860,7 +860,7 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ - spin_unlock(&ctx->completion_lock); + io_cq_unlock(ctx); } /* For defered completions this is not as strict as it is otherwise, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 062899b1fe86..2277c05f52a6 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -93,6 +93,11 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } +static inline void io_cq_unlock(struct io_ring_ctx *ctx) +{ + spin_unlock(&ctx->completion_lock); +} + void io_cq_unlock_post(struct io_ring_ctx *ctx); static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, -- cgit v1.2.3 From 0ced756f6412123b01cd72e5741d9dd6ae5f1dd5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:57 +0000 Subject: io_uring: improve rsrc quiesce refs checks Do a little bit of refactoring of io_rsrc_ref_quiesce(), flatten the data refs checks and so get rid of a conditional weird unlock-else-break construct. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d21283e9f88a77612c746ed526d86fe3bfb58a70.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 133608200769..b36d32534165 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -330,17 +330,14 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); - if (atomic_read(&data->refs) > 0) { - /* - * it has been revived by another thread while - * we were unlocked - */ - mutex_unlock(&ctx->uring_lock); - } else { + if (atomic_read(&data->refs) <= 0) break; - } + /* + * it has been revived by another thread while + * we were unlocked + */ + mutex_unlock(&ctx->uring_lock); } - reinit: atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ -- cgit v1.2.3 From 77e3202a21967e7de5b4412c0534f2e34e175227 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Nov 2022 15:21:58 +0000 Subject: io_uring: don't reinstall quiesce node for each tw There is no need to reinit data and install a new rsrc node every time we get a task_work, it's detrimental, just execute it and conitnue waiting. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3895d3344164cd9b3a0bbb24a6e357e20a13434b.1669821213.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/rsrc.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) (limited to 'io_uring') diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index b36d32534165..d25309400a45 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -309,22 +309,27 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, /* As we may drop ->uring_lock, other task may have started quiesce */ if (data->quiesce) return -ENXIO; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + return ret; + io_rsrc_node_switch(ctx, data); + + /* kill initial ref, already quiesced if zero */ + if (atomic_dec_and_test(&data->refs)) + return 0; data->quiesce = true; + mutex_unlock(&ctx->uring_lock); do { - ret = io_rsrc_node_switch_start(ctx); - if (ret) - break; - io_rsrc_node_switch(ctx, data); - - /* kill initial ref, already quiesced if zero */ - if (atomic_dec_and_test(&data->refs)) - break; - mutex_unlock(&ctx->uring_lock); - ret = io_run_task_work_sig(ctx); - if (ret < 0) - goto reinit; + if (ret < 0) { + atomic_inc(&data->refs); + /* wait for all works potentially completing data->done */ + flush_delayed_work(&ctx->rsrc_put_work); + reinit_completion(&data->done); + mutex_lock(&ctx->uring_lock); + break; + } flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); @@ -338,14 +343,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, */ mutex_unlock(&ctx->uring_lock); } -reinit: - atomic_inc(&data->refs); - /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); - reinit_completion(&data->done); - - mutex_lock(&ctx->uring_lock); - } while (ret >= 0); + } while (1); data->quiesce = false; return ret; -- cgit v1.2.3 From ef0ec1ad03119b8b46b035dad42bca7d6da7c2e5 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:26 +0000 Subject: io_uring: dont remove file from msg_ring reqs We should not be messing with req->file outside of core paths. Clearing it makes msg_ring non reentrant, i.e. luckily io_msg_send_fd() fails the request on failed io_double_lock_ctx() but clearly was originally intended to do retries instead. 
Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/e5ac9edadb574fe33f6d727cb8f14ce68262a684.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/msg_ring.c | 4 ---- io_uring/opdef.c | 7 +++++++ io_uring/opdef.h | 2 ++ 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c30765579a8e..5fa92170c373 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1799,7 +1799,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; /* If the op doesn't have a file, we're not polling for it */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) io_iopoll_req_issued(req, issue_flags); return 0; diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index afb543aab9f6..615c85e164ab 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -167,9 +167,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - /* put file to avoid an attempt to IOPOLL the req */ - if (!(req->flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); - req->file = NULL; return IOU_OK; } diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 83dc0f9ad3b2..04dd2c983fce 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -63,6 +63,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "READV", .prep = io_prep_rw, @@ -80,6 +81,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "WRITEV", .prep = io_prep_rw, @@ -103,6 +105,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", .prep = io_prep_rw, @@ -118,6 +121,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", .prep = io_prep_rw, @@ -277,6 +281,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "READ", .prep = io_prep_rw, @@ -292,6 +297,7 @@ const struct io_op_def io_op_defs[] = { .audit_skip = 1, .ioprio = 1, .iopoll = 1, + .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .name = "WRITE", .prep = io_prep_rw, @@ -481,6 +487,7 @@ const struct io_op_def io_op_defs[] = { .plug = 1, .name = "URING_CMD", .iopoll = 1, + .iopoll_queue = 1, .async_size = uring_cmd_pdu_size(1), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 3efe06d25473..df7e13d9bfba 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -25,6 +25,8 @@ struct io_op_def { unsigned ioprio : 1; /* supports iopoll */ unsigned iopoll : 1; + /* have to be put into the iopoll list */ + unsigned iopoll_queue : 1; /* opcode specific path will handle ->async_data allocation if needed */ unsigned manual_alloc : 1; /* size of async data needed, if any */ -- cgit v1.2.3 From 4c979eaefa4356d385b7c7d2877dc04d7fe88969 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:27 +0000 Subject: io_uring: improve io_double_lock_ctx fail handling msg_ring will fail the request if it can't lock rings, instead 
punt it to io-wq as was originally intended. Cc: stable@vger.kernel.org Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4697f05afcc37df5c8f89e2fe6d9c7c19f0241f9.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 615c85e164ab..c7d6586164ca 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -164,6 +164,8 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) } done: + if (ret == -EAGAIN) + return -EAGAIN; if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); -- cgit v1.2.3 From a85381d8326d75417ae177bddf44be533d1d21be Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:28 +0000 Subject: io_uring: skip overflow CQE posting for dying ring After io_ring_ctx_wait_and_kill() is called there should be no users poking into rings and so there is no need to post CQEs. So, instead of trying to post overflowed CQEs into the CQ, drop them. Also, do it in io_ring_exit_work() in a loop to reduce the number of contexts it can be executed from and even when it struggles to quiesce the ring we won't be leaving memory allocated for longer than needed. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/26d13751155a735a3029e24f8d9ca992f810419d.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 5fa92170c373..ecd78898dbd3 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -600,12 +600,30 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) } /* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) +{ + struct io_overflow_cqe *ocqe; + LIST_HEAD(list); + + io_cq_lock(ctx); + list_splice_init(&ctx->cq_overflow_list, &list); + clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); + io_cq_unlock(ctx); + + while (!list_empty(&list)) { + ocqe = list_first_entry(&list, struct io_overflow_cqe, list); + list_del(&ocqe->list); + kfree(ocqe); + } +} + +/* Returns true if there are no backlogged entries after the flush */ +static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool all_flushed; size_t cqe_size = sizeof(struct io_uring_cqe); - if (!force && __io_cqring_events(ctx) == ctx->cq_entries) + if (__io_cqring_events(ctx) == ctx->cq_entries) return false; if (ctx->flags & IORING_SETUP_CQE32) @@ -616,15 +634,11 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true); struct io_overflow_cqe *ocqe; - if (!cqe && !force) + if (!cqe) break; ocqe = list_first_entry(&ctx->cq_overflow_list, struct io_overflow_cqe, list); - if (cqe) - memcpy(cqe, &ocqe->cqe, cqe_size); - else - io_account_cq_overflow(ctx); - + memcpy(cqe, &ocqe->cqe, cqe_size); list_del(&ocqe->list); kfree(ocqe); } @@ -647,7 +661,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, false); + ret = __io_cqring_overflow_flush(ctx); if (ctx->flags & IORING_SETUP_IOPOLL) 
mutex_unlock(&ctx->uring_lock); } @@ -1467,7 +1481,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) check_cq = READ_ONCE(ctx->check_cq); if (unlikely(check_cq)) { if (check_cq & BIT(IO_CHECK_CQ_OVERFLOW_BIT)) - __io_cqring_overflow_flush(ctx, false); + __io_cqring_overflow_flush(ctx); /* * Similarly do not spin if we have not informed the user of any * dropped CQE. @@ -2635,8 +2649,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) __io_sqe_buffers_unregister(ctx); if (ctx->file_data) __io_sqe_files_unregister(ctx); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); + io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); io_alloc_cache_free(&ctx->apoll_cache, io_apoll_cache_free); io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); @@ -2777,6 +2790,12 @@ static __cold void io_ring_exit_work(struct work_struct *work) * as nobody else will be looking for them. */ do { + if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { + mutex_lock(&ctx->uring_lock); + io_cqring_overflow_kill(ctx); + mutex_unlock(&ctx->uring_lock); + } + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); @@ -2842,8 +2861,6 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); percpu_ref_kill(&ctx->refs); - if (ctx->rings) - __io_cqring_overflow_flush(ctx, true); xa_for_each(&ctx->personalities, index, creds) io_unregister_personality(ctx, index); if (ctx->rings) -- cgit v1.2.3 From 1b346e4aa8e79227391ffd6b7c6ee5acf0fa8bfc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:29 +0000 Subject: io_uring: don't check overflow flush failures The only way to fail overflowed CQEs flush is for CQ to be fully packed. There is one place checking for flush failures, i.e. io_cqring_wait(), but we limit the number to be waited for by the CQ size, so getting a failure automatically means that we're done with waiting. Don't check for failures, rarely but they might spuriously fail CQ waiting with -EBUSY. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b720a45c03345655517f8202cbd0bece2848fb2.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ecd78898dbd3..ba5ef1e675c1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -618,13 +618,12 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) } /* Returns true if there are no backlogged entries after the flush */ -static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx) { - bool all_flushed; size_t cqe_size = sizeof(struct io_uring_cqe); if (__io_cqring_events(ctx) == ctx->cq_entries) - return false; + return; if (ctx->flags & IORING_SETUP_CQE32) cqe_size <<= 1; @@ -643,30 +642,23 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx) kfree(ocqe); } - all_flushed = list_empty(&ctx->cq_overflow_list); - if (all_flushed) { + if (list_empty(&ctx->cq_overflow_list)) { clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); atomic_andnot(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - io_cq_unlock_post(ctx); - return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) +static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) { - bool ret = true; - if (test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)) { /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx); + __io_cqring_overflow_flush(ctx); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } - - return ret; } void __io_put_task(struct task_struct *task, int nr) @@ -2494,11 +2486,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, trace_io_uring_cqring_wait(ctx, min_events); do { - /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx)) { - ret = -EBUSY; - break; - } + io_cqring_overflow_flush(ctx); prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, TASK_INTERRUPTIBLE); ret = io_cqring_wait_schedule(ctx, &iowq, timeout); -- cgit v1.2.3 From e6aeb2721d3bad8379c43644d0380908e93b0187 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:30 +0000 Subject: io_uring: complete all requests in task context This patch adds ctx->task_complete flag. If set, we'll complete all requests in the context of the original task. Note, this extends to completion CQE posting only but not io_kiocb cleanup / free, e.g. io-wq may free the requests in the free calllback. This flag will be used later for optimisations purposes. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/21ece72953f76bb2e77659a72a14326227ab6460.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 2 ++ include/linux/io_uring_types.h | 2 ++ io_uring/io_uring.c | 14 +++++++++++--- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 29e519752da4..934e5dd4ccc0 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -11,6 +11,8 @@ enum io_uring_cmd_flags { IO_URING_F_UNLOCKED = 2, /* the request is executed from poll, it should not be freed */ IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, /* int's last bit, sign checks are usually faster than a bit test */ IO_URING_F_NONBLOCK = INT_MIN, diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index accdfecee953..6be1e1359c89 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -208,6 +208,8 @@ struct io_ring_ctx { unsigned int drain_disabled: 1; unsigned int has_evfd: 1; unsigned int syscall_iopoll: 1; + /* all CQEs should be posted only by the submitter task */ + unsigned int task_complete: 1; } ____cacheline_aligned_in_smp; /* submission data */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ba5ef1e675c1..13d56472dd49 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -921,8 +921,11 @@ static void __io_req_complete_post(struct io_kiocb *req) void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { - if (!(issue_flags & IO_URING_F_UNLOCKED) || - !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + if (req->ctx->task_complete && (issue_flags & IO_URING_F_IOWQ)) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + } else if (!(issue_flags & IO_URING_F_UNLOCKED) || + !(req->ctx->flags & IORING_SETUP_IOPOLL)) { __io_req_complete_post(req); } else { struct io_ring_ctx *ctx = req->ctx; @@ -1830,7 +1833,7 @@ void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); const struct io_op_def *def = &io_op_defs[req->opcode]; - unsigned int issue_flags = IO_URING_F_UNLOCKED; + unsigned int issue_flags = IO_URING_F_UNLOCKED | IO_URING_F_IOWQ; bool needs_poll = false; int ret = 0, err = -ECANCELED; @@ -3490,6 +3493,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (!ctx) return -ENOMEM; + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && + !(ctx->flags & IORING_SETUP_IOPOLL) && + !(ctx->flags & IORING_SETUP_SQPOLL)) + ctx->task_complete = true; + /* * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user * space applications don't need to do io completion events -- cgit v1.2.3 From 17add5cea2bbafea0d481f1a3ea9dea019a98ee9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:31 +0000 Subject: io_uring: force multishot CQEs into task context Multishot are posting CQEs outside of the normal request completion path, which is usually done from within a task work handler. However, it might be not the case when it's yet to be polled but has been punted to io-wq. Make it abide ->task_complete and push it to the polling path when executed by io-wq. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d7714aaff583096769a0f26e8e747759e556feb1.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/net.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'io_uring') diff --git a/io_uring/net.c b/io_uring/net.c index 90342dcb6b1d..f276f6dd5b09 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -67,6 +67,19 @@ struct io_sr_msg { struct io_kiocb *notif; }; +static inline bool io_check_multishot(struct io_kiocb *req, + unsigned int issue_flags) +{ + /* + * When ->locked_cq is set we only allow to post CQEs from the original + * task context. Usual request completions will be handled in other + * generic paths but multipoll may decide to post extra cqes. + */ + return !(issue_flags & IO_URING_F_IOWQ) || + !(issue_flags & IO_URING_F_MULTISHOT) || + !req->ctx->task_complete; +} + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -730,6 +743,9 @@ int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return io_setup_async_msg(req, kmsg, issue_flags); + if (!io_check_multishot(req, issue_flags)) + return io_setup_async_msg(req, kmsg, issue_flags); + retry_multishot: if (io_do_buffer_select(req)) { void __user *buf; @@ -829,6 +845,9 @@ int io_recv(struct io_kiocb *req, unsigned int issue_flags) (sr->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; + if (!io_check_multishot(req, issue_flags)) + return -EAGAIN; + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -1280,6 +1299,8 @@ int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct file *file; int ret, fd; + if (!io_check_multishot(req, issue_flags)) + return -EAGAIN; retry: if (!fixed) { fd = __get_unused_fd_flags(accept->flags, accept->nofile); -- cgit v1.2.3 From d34b1b0b6779d4f5ee877b53cad90eef0f1cbe34 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:32 +0000 Subject: io_uring: use tw for putting rsrc Use task_work for completing rsrc removals, it'll be needed later for spinlock optimisations. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/cbba5d53a11ee6fc2194dacea262c1d733c8b529.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 1 + io_uring/rsrc.c | 19 +++++++++++++++++-- io_uring/rsrc.h | 1 + 4 files changed, 20 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 6be1e1359c89..dcd8a563ab52 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -328,6 +328,7 @@ struct io_ring_ctx { struct io_rsrc_data *buf_data; struct delayed_work rsrc_put_work; + struct callback_head rsrc_put_tw; struct llist_head rsrc_put_llist; struct list_head rsrc_ref_list; spinlock_t rsrc_ref_lock; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 13d56472dd49..a29cbf973287 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -326,6 +326,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); + init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d25309400a45..18de10c68a15 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -204,6 +204,14 @@ void io_rsrc_put_work(struct work_struct *work) } } +void io_rsrc_put_tw(struct callback_head *cb) +{ + struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, + rsrc_put_tw); + + io_rsrc_put_work(&ctx->rsrc_put_work.work); +} + void io_wait_rsrc_data(struct io_rsrc_data *data) { if (data && !atomic_dec_and_test(&data->refs)) @@ -242,8 +250,15 @@ static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) } spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); + if (!first_add) + return; + + if (ctx->submitter_task) { + if (!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, + ctx->notify_method)) + return; + } + mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(void) diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 81445a477622..2b8743645efc 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -53,6 +53,7 @@ struct io_mapped_ubuf { struct bio_vec bvec[]; }; +void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_put_work(struct work_struct *work); void io_rsrc_refs_refill(struct io_ring_ctx *ctx); void io_wait_rsrc_data(struct io_rsrc_data *data); -- cgit v1.2.3 From 77e443ab294ca5b88896e8ddab41884948d5519a Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:33 +0000 Subject: io_uring: never run tw and fallback in parallel Once we fallback a tw we want all requests to that task to be given to the fallback wq so we dont run it in parallel with the last, i.e. post PF_EXITING, tw run of the task. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/96f4987265c4312f376f206511c6af3e77aaf5ac.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a29cbf973287..310cdd864a6c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -149,6 +149,7 @@ static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); +static __cold void io_fallback_tw(struct io_uring_task *tctx); static struct kmem_cache *req_cachep; @@ -1149,10 +1150,17 @@ void tctx_task_work(struct callback_head *cb) struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); struct llist_node fake = {}; - struct llist_node *node = io_llist_xchg(&tctx->task_list, &fake); + struct llist_node *node; unsigned int loops = 1; - unsigned int count = handle_tw_list(node, &ctx, &uring_locked, NULL); + unsigned int count; + + if (unlikely(current->flags & PF_EXITING)) { + io_fallback_tw(tctx); + return; + } + node = io_llist_xchg(&tctx->task_list, &fake); + count = handle_tw_list(node, &ctx, &uring_locked, NULL); node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); while (node != &fake) { loops++; -- cgit v1.2.3 From 11373026f2960390d5e330df4e92735c4265c440 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:34 +0000 Subject: io_uring: get rid of double locking We don't need to take both uring_locks at once, msg_ring can be split in two parts, first getting a file from the filetable of the first ring and then installing it into the second one. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/a80ecc2bc99c3b3f2cf20015d618b7c51419a797.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 85 ++++++++++++++++++++++++++++++----------------------- io_uring/msg_ring.h | 1 + io_uring/opdef.c | 1 + 3 files changed, 51 insertions(+), 36 deletions(-) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index c7d6586164ca..387c45a58654 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -15,6 +15,7 @@ struct io_msg { struct file *file; + struct file *src_file; u64 user_data; u32 len; u32 cmd; @@ -23,6 +24,17 @@ struct io_msg { u32 flags; }; +void io_msg_ring_cleanup(struct io_kiocb *req) +{ + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + + if (WARN_ON_ONCE(!msg->src_file)) + return; + + fput(msg->src_file); + msg->src_file = NULL; +} + static int io_msg_ring_data(struct io_kiocb *req) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -37,17 +49,13 @@ static int io_msg_ring_data(struct io_kiocb *req) return -EOVERFLOW; } -static void io_double_unlock_ctx(struct io_ring_ctx *ctx, - struct io_ring_ctx *octx, +static void io_double_unlock_ctx(struct io_ring_ctx *octx, unsigned int issue_flags) { - if (issue_flags & IO_URING_F_UNLOCKED) - mutex_unlock(&ctx->uring_lock); mutex_unlock(&octx->uring_lock); } -static int io_double_lock_ctx(struct io_ring_ctx *ctx, - struct io_ring_ctx *octx, +static int io_double_lock_ctx(struct io_ring_ctx *octx, unsigned int issue_flags) { /* @@ -60,17 +68,28 @@ static int io_double_lock_ctx(struct io_ring_ctx *ctx, return -EAGAIN; return 0; } + mutex_lock(&octx->uring_lock); + return 0; +} - /* Always grab smallest value ctx first. We know ctx != octx. */ - if (ctx < octx) { - mutex_lock(&ctx->uring_lock); - mutex_lock(&octx->uring_lock); - } else { - mutex_lock(&octx->uring_lock); - mutex_lock(&ctx->uring_lock); +static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_ring_ctx *ctx = req->ctx; + struct file *file = NULL; + unsigned long file_ptr; + int idx = msg->src_fd; + + io_ring_submit_lock(ctx, issue_flags); + if (likely(idx < ctx->nr_user_files)) { + idx = array_index_nospec(idx, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, idx)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + if (file) + get_file(file); } - - return 0; + io_ring_submit_unlock(ctx, issue_flags); + return file; } static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) @@ -78,34 +97,27 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; - unsigned long file_ptr; - struct file *src_file; + struct file *src_file = msg->src_file; int ret; if (target_ctx == ctx) return -EINVAL; + if (!src_file) { + src_file = io_msg_grab_file(req, issue_flags); + if (!src_file) + return -EBADF; + msg->src_file = src_file; + req->flags |= REQ_F_NEED_CLEANUP; + } - ret = io_double_lock_ctx(ctx, target_ctx, issue_flags); - if (unlikely(ret)) - return ret; - - ret = -EBADF; - if (unlikely(msg->src_fd >= ctx->nr_user_files)) - goto out_unlock; - - msg->src_fd = array_index_nospec(msg->src_fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, msg->src_fd)->file_ptr; - if (!file_ptr) - goto out_unlock; - - 
src_file = (struct file *) (file_ptr & FFS_MASK); - get_file(src_file); + if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); - if (ret < 0) { - fput(src_file); + if (ret < 0) goto out_unlock; - } + msg->src_file = NULL; + req->flags &= ~REQ_F_NEED_CLEANUP; if (msg->flags & IORING_MSG_RING_CQE_SKIP) goto out_unlock; @@ -119,7 +131,7 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) ret = -EOVERFLOW; out_unlock: - io_double_unlock_ctx(ctx, target_ctx, issue_flags); + io_double_unlock_ctx(target_ctx, issue_flags); return ret; } @@ -130,6 +142,7 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; + msg->src_file = NULL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); msg->cmd = READ_ONCE(sqe->addr); diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index fb9601f202d0..3987ee6c0e5f 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -2,3 +2,4 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); +void io_msg_ring_cleanup(struct io_kiocb *req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 04dd2c983fce..3aa0d65c50e3 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -445,6 +445,7 @@ const struct io_op_def io_op_defs[] = { .name = "MSG_RING", .prep = io_msg_ring_prep, .issue = io_msg_ring, + .cleanup = io_msg_ring_cleanup, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, -- cgit v1.2.3 From 172113101641cf1f9628c528ec790cb809f2b704 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:35 +0000 Subject: io_uring: extract a io_msg_install_complete helper Extract a helper called io_msg_install_complete() from io_msg_send_fd(), will be used later. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1500ca1054cc4286a3ee1c60aacead57fcdfa02a.1670384893.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 387c45a58654..525063ac3dab 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -92,36 +92,25 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl return file; } -static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - struct io_ring_ctx *ctx = req->ctx; struct file *src_file = msg->src_file; int ret; - if (target_ctx == ctx) - return -EINVAL; - if (!src_file) { - src_file = io_msg_grab_file(req, issue_flags); - if (!src_file) - return -EBADF; - msg->src_file = src_file; - req->flags |= REQ_F_NEED_CLEANUP; - } - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); if (ret < 0) goto out_unlock; + msg->src_file = NULL; req->flags &= ~REQ_F_NEED_CLEANUP; if (msg->flags & IORING_MSG_RING_CQE_SKIP) goto out_unlock; - /* * If this fails, the target still received the file descriptor but * wasn't notified of the fact. 
This means that if this request @@ -135,6 +124,25 @@ out_unlock: return ret; } +static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + struct io_ring_ctx *ctx = req->ctx; + struct file *src_file = msg->src_file; + + if (target_ctx == ctx) + return -EINVAL; + if (!src_file) { + src_file = io_msg_grab_file(req, issue_flags); + if (!src_file) + return -EBADF; + msg->src_file = src_file; + req->flags |= REQ_F_NEED_CLEANUP; + } + return io_msg_install_complete(req, issue_flags); +} + int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); -- cgit v1.2.3 From 6d043ee1164ca3305738131f170e560587070fa9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 03:53:36 +0000 Subject: io_uring: do msg_ring in target task via tw While executing in a context of one io_uring instance msg_ring manipulates another ring. We're trying to keep CQEs posting contained in the context of the ring-owner task, use task_work to send the request to the target ring's task when we're modifying its CQ or trying to install a file. Note, we can't safely use io_uring task_work infra and have to use task_work directly. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4d76c7b28ed5d71b520de4482fbb7f660f21cd80.1670384893.git.asml.silence@gmail.com [axboe: use TWA_SIGNAL_NO_IPI] Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 53 insertions(+), 3 deletions(-) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 525063ac3dab..a1accf342079 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -16,6 +16,7 @@ struct io_msg { struct file *file; struct file *src_file; + struct callback_head tw; u64 user_data; u32 len; u32 cmd; @@ -35,6 +36,23 @@ void io_msg_ring_cleanup(struct io_kiocb *req) msg->src_file = NULL; } +static void io_msg_tw_complete(struct callback_head *head) +{ + struct io_msg *msg = container_of(head, struct io_msg, tw); + struct io_kiocb *req = cmd_to_io_kiocb(msg); + struct io_ring_ctx *target_ctx = req->file->private_data; + int ret = 0; + + if (current->flags & PF_EXITING) + ret = -EOWNERDEAD; + else if (!io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) + ret = -EOVERFLOW; + + if (ret < 0) + req_set_fail(req); + io_req_queue_tw_complete(req, ret); +} + static int io_msg_ring_data(struct io_kiocb *req) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -43,6 +61,15 @@ static int io_msg_ring_data(struct io_kiocb *req) if (msg->src_fd || msg->dst_fd || msg->flags) return -EINVAL; + if (target_ctx->task_complete && current != target_ctx->submitter_task) { + init_task_work(&msg->tw, io_msg_tw_complete); + if (task_work_add(target_ctx->submitter_task, &msg->tw, + TWA_SIGNAL_NO_IPI)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; + } + if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, 0)) return 0; @@ -124,6 +151,19 @@ out_unlock: return ret; } +static void io_msg_tw_fd_complete(struct callback_head *head) +{ + struct io_msg *msg = container_of(head, struct io_msg, tw); + struct io_kiocb *req = cmd_to_io_kiocb(msg); + int ret = -EOWNERDEAD; + + if (!(current->flags & PF_EXITING)) + ret = io_msg_install_complete(req, IO_URING_F_UNLOCKED); + if (ret < 0) + req_set_fail(req); + io_req_queue_tw_complete(req, ret); +} + static 
int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; @@ -140,6 +180,15 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) msg->src_file = src_file; req->flags |= REQ_F_NEED_CLEANUP; } + + if (target_ctx->task_complete && current != target_ctx->submitter_task) { + init_task_work(&msg->tw, io_msg_tw_fd_complete); + if (task_work_add(target_ctx->submitter_task, &msg->tw, + TWA_SIGNAL)) + return -EOWNERDEAD; + + return IOU_ISSUE_SKIP_COMPLETE; + } return io_msg_install_complete(req, issue_flags); } @@ -185,10 +234,11 @@ int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) } done: - if (ret == -EAGAIN) - return -EAGAIN; - if (ret < 0) + if (ret < 0) { + if (ret == -EAGAIN || ret == IOU_ISSUE_SKIP_COMPLETE) + return ret; req_set_fail(req); + } io_req_set_res(req, ret, 0); return IOU_OK; } -- cgit v1.2.3 From f66f73421f0a929734bb41dde575e6d7859e548f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 7 Dec 2022 08:50:01 -0700 Subject: io_uring: skip spinlocking for ->task_complete ->task_complete was added to serialised CQE posting by doing it from the task context only (or fallback wq when the task is dead), and now we can use that to avoid taking ->completion_lock while filling CQ entries. The patch skips spinlocking only in two spots, __io_submit_flush_completions() and flushing in io_aux_cqe, it's safer and covers all cases we care about. Extra care is taken to force taking the lock while queueing overflow entries. It fundamentally relies on SINGLE_ISSUER to have only one task posting events. It also need to take into account overflowed CQEs, flushing of which happens in the cq wait path, and so this implementation also needs DEFER_TASKRUN to limit waiters. For the same reason we disable it for SQPOLL, and for IOPOLL as it won't benefit from it in any case. DEFER_TASKRUN, SQPOLL and IOPOLL requirement may be relaxed in the future. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2a8c91fd82cfcdcc1d2e5bac7051fe2c183bda73.1670384893.git.asml.silence@gmail.com [axboe: modify to apply] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 71 +++++++++++++++++++++++++++++++++++++++-------------- io_uring/io_uring.h | 10 +++++++- 2 files changed, 61 insertions(+), 20 deletions(-) (limited to 'io_uring') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 310cdd864a6c..d367dbe1284f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -584,13 +584,25 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_eventfd_flush_signal(ctx); } +static inline void __io_cq_lock(struct io_ring_ctx *ctx) + __acquires(ctx->completion_lock) +{ + if (!ctx->task_complete) + spin_lock(&ctx->completion_lock); +} + +static inline void __io_cq_unlock(struct io_ring_ctx *ctx) +{ + if (!ctx->task_complete) + spin_unlock(&ctx->completion_lock); +} + /* keep it inlined for io_submit_flush_completions() */ -static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx) +static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); - spin_unlock(&ctx->completion_lock); - + __io_cq_unlock(ctx); io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } @@ -598,7 +610,10 @@ static inline void io_cq_unlock_post_inline(struct io_ring_ctx *ctx) void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { - io_cq_unlock_post_inline(ctx); + io_commit_cqring(ctx); + spin_unlock(&ctx->completion_lock); + io_commit_cqring_flush(ctx); + io_cqring_wake(ctx); } /* Returns true if there are no backlogged entries after the flush */ @@ -785,12 +800,13 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) return &rings->cqes[off]; } -static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, - bool allow_overflow) +static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, + u32 cflags) { struct io_uring_cqe *cqe; - lockdep_assert_held(&ctx->completion_lock); + if (!ctx->task_complete) + lockdep_assert_held(&ctx->completion_lock); ctx->cq_extra++; @@ -813,10 +829,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 } return true; } - - if (allow_overflow) - return io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - return false; } @@ -830,7 +842,17 @@ static void __io_flush_post_cqes(struct io_ring_ctx *ctx) for (i = 0; i < state->cqes_count; i++) { struct io_uring_cqe *cqe = &state->cqes[i]; - io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags, true); + if (!io_fill_cqe_aux(ctx, cqe->user_data, cqe->res, cqe->flags)) { + if (ctx->task_complete) { + spin_lock(&ctx->completion_lock); + io_cqring_event_overflow(ctx, cqe->user_data, + cqe->res, cqe->flags, 0, 0); + spin_unlock(&ctx->completion_lock); + } else { + io_cqring_event_overflow(ctx, cqe->user_data, + cqe->res, cqe->flags, 0, 0); + } + } } state->cqes_count = 0; } @@ -841,7 +863,10 @@ static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u bool filled; io_cq_lock(ctx); - filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + if (!filled && allow_overflow) + filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + io_cq_unlock_post(ctx); return filled; } @@ -865,10 +890,10 @@ bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 
lockdep_assert_held(&ctx->uring_lock); if (ctx->submit_state.cqes_count == length) { - io_cq_lock(ctx); + __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ - io_cq_unlock(ctx); + __io_cq_unlock_post(ctx); } /* For defered completions this is not as strict as it is otherwise, @@ -1403,7 +1428,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - io_cq_lock(ctx); + __io_cq_lock(ctx); /* must come first to preserve CQE ordering in failure cases */ if (state->cqes_count) __io_flush_post_cqes(ctx); @@ -1411,10 +1436,18 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP)) - __io_fill_cqe_req(ctx, req); + if (!(req->flags & REQ_F_CQE_SKIP) && + unlikely(!__io_fill_cqe_req(ctx, req))) { + if (ctx->task_complete) { + spin_lock(&ctx->completion_lock); + io_req_cqe_overflow(req); + spin_unlock(&ctx->completion_lock); + } else { + io_req_cqe_overflow(req); + } + } } - io_cq_unlock_post_inline(ctx); + __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2277c05f52a6..1b2f0b2cc888 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -133,7 +133,7 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, */ cqe = io_get_cqe(ctx); if (unlikely(!cqe)) - return io_req_cqe_overflow(req); + return false; trace_io_uring_complete(req->ctx, req, req->cqe.user_data, req->cqe.res, req->cqe.flags, @@ -156,6 +156,14 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, return true; } +static inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + if (likely(__io_fill_cqe_req(ctx, req))) + return true; + return io_req_cqe_overflow(req); +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; -- cgit v1.2.3 From 761c61c15903db41343532882b0443addb8c2faf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 8 Dec 2022 09:36:02 -0700 Subject: io_uring/msg_ring: flag target ring as having task_work, if needed Before the recent change, we didn't even wake the targeted task when posting the cqe remotely. Now we do wake it, but we do want to ensure that the recipient knows there's potential work there that needs to get processed to get the CQE posted. OR in IORING_SQ_TASKRUN for that purpose. Link: https://lore.kernel.org/io-uring/2843c6b4-ba9a-b67d-e0f4-957f42098489@kernel.dk/ Fixes: 6d043ee1164c ("io_uring: do msg_ring in target task via tw") Signed-off-by: Jens Axboe --- io_uring/msg_ring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'io_uring') diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index a1accf342079..2d3cd945a531 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -67,6 +67,7 @@ static int io_msg_ring_data(struct io_kiocb *req) TWA_SIGNAL_NO_IPI)) return -EOWNERDEAD; + atomic_or(IORING_SQ_TASKRUN, &target_ctx->rings->sq_flags); return IOU_ISSUE_SKIP_COMPLETE; } -- cgit v1.2.3
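
Editor's note: the series above moves msg_ring CQE posting into the target ring's submitter task and, in the final patch, sets IORING_SQ_TASKRUN on the target so the recipient knows it must enter the kernel to run the pending task work. Below is a minimal, hypothetical userspace sketch of that flow using liburing; the ring sizes, flag combination, and user_data/len values are illustrative assumptions and not part of the patches, error handling is trimmed, and it assumes liburing 2.3+ on a 6.1+ kernel. It is a sketch of how the feature is exercised from userspace, not a definitive test case.

/* Send a CQE from one ring into another with IORING_OP_MSG_RING. */
#include <liburing.h>
#include <stdio.h>

int main(void)
{
	struct io_uring src, dst;
	/* Target ring: SINGLE_ISSUER + DEFER_TASKRUN (and no IOPOLL/SQPOLL)
	 * is the combination that lets the kernel set ctx->task_complete. */
	struct io_uring_params p = {
		.flags = IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN,
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;

	if (io_uring_queue_init_params(8, &dst, &p))
		return 1;
	if (io_uring_queue_init(8, &src, 0))
		return 1;

	/* Post user_data 0xcafe with res 42 into dst's CQ via msg_ring. */
	sqe = io_uring_get_sqe(&src);
	io_uring_prep_msg_ring(sqe, dst.ring_fd, 42, 0xcafe, 0);
	io_uring_submit_and_wait(&src, 1);

	/* The receiver enters the kernel to reap the CQE; with DEFER_TASKRUN
	 * this is also where deferred task work gets run, which is why the
	 * last patch raises IORING_SQ_TASKRUN on the target ring. */
	if (!io_uring_wait_cqe(&dst, &cqe)) {
		printf("got cqe: user_data=%llu res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
		io_uring_cqe_seen(&dst, cqe);
	}

	io_uring_queue_exit(&src);
	io_uring_queue_exit(&dst);
	return 0;
}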