Diffstat (limited to 'io_uring')
 -rw-r--r--   io_uring/filetable.c |  2
 -rw-r--r--   io_uring/io_uring.c  |  4
 -rw-r--r--   io_uring/io_uring.h  |  9
 -rw-r--r--   io_uring/net.c       | 14
 -rw-r--r--   io_uring/poll.c      | 47
 -rw-r--r--   io_uring/rw.c        | 10
 -rw-r--r--   io_uring/xattr.c     |  8
 7 files changed, 65 insertions(+), 29 deletions(-)
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 7b473259f3f4..68dfc6936aa7 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -101,8 +101,6 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
err:
if (needs_switch)
io_rsrc_node_switch(ctx, ctx->file_data);
- if (ret)
- fput(file);
return ret;
}
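The hunk above drops the error-path fput(), leaving the caller responsible for the file reference when installation fails; putting it here as well would drop the reference twice. A minimal caller-side sketch of that convention (the wrapper below and any parameters beyond those visible in the hunk are illustrative, not the actual io_uring call site):

static int install_fixed_or_put(struct io_ring_ctx *ctx, struct file *file,
				unsigned int issue_flags, unsigned int slot)
{
	int ret = io_install_fixed_file(ctx, file, issue_flags, slot);

	/* single point of cleanup in the caller; no second fput() inside */
	if (ret)
		fput(file);
	return ret;
}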
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index adecdf65b130..3436e0b83534 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2750,8 +2750,10 @@ static __cold void io_tctx_exit_cb(struct callback_head *cb)
/*
* When @in_idle, we're in cancellation and it's racy to remove the
* node. It'll be removed by the end of cancellation, just ignore it.
+ * tctx can be NULL if the queueing of this task_work raced with
+ * work cancellation off the exec path.
*/
- if (!atomic_read(&tctx->in_idle))
+ if (tctx && !atomic_read(&tctx->in_idle))
io_uring_del_tctx_node((unsigned long)work->ctx);
complete(&work->completion);
}
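For reference, a simplified sketch of the whole callback with the NULL check applied (field and type names follow the upstream io_uring definitions; anything beyond what the hunk shows should be treated as an assumption):

static __cold void io_tctx_exit_cb(struct callback_head *cb)
{
	struct io_uring_task *tctx = current->io_uring;
	struct io_tctx_exit *work = container_of(cb, struct io_tctx_exit, task_work);

	/*
	 * tctx is NULL if queueing this task_work raced with cancellation on
	 * the exec path; there is nothing left to remove in that case.
	 */
	if (tctx && !atomic_read(&tctx->in_idle))
		io_uring_del_tctx_node((unsigned long)work->ctx);
	complete(&work->completion);
}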
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 76659d2fc90c..062899b1fe86 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -246,9 +246,14 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
static inline int io_run_task_work(void)
{
+ /*
+ * Always check-and-clear the task_work notification signal. With how
+ * signaling works for task_work, we can find it set with nothing to
+ * run. We need to clear it for that case, like get_signal() does.
+ */
+ if (test_thread_flag(TIF_NOTIFY_SIGNAL))
+ clear_notify_signal();
if (task_work_pending(current)) {
- if (test_thread_flag(TIF_NOTIFY_SIGNAL))
- clear_notify_signal();
__set_current_state(TASK_RUNNING);
task_work_run();
return 1;
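Put together, the helper now reads roughly as below; the tail of the function is not visible in the hunk, so the final return is reconstructed and should be treated as an assumption. The point of the reordering is that TIF_NOTIFY_SIGNAL gets cleared even when no task_work is actually pending, mirroring get_signal():

static inline int io_run_task_work(void)
{
	/* the notify flag can be set with nothing queued; clear it regardless */
	if (test_thread_flag(TIF_NOTIFY_SIGNAL))
		clear_notify_signal();
	if (task_work_pending(current)) {
		__set_current_state(TASK_RUNNING);
		task_work_run();
		return 1;
	}
	return 0;
}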
diff --git a/io_uring/net.c b/io_uring/net.c
index 90342dcb6b1d..cb831326ea5b 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -365,7 +365,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!sock))
return -ENOTSOCK;
- ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
+ ret = import_single_range(ITER_SOURCE, sr->buf, sr->len, &iov, &msg.msg_iter);
if (unlikely(ret))
return ret;
@@ -451,7 +451,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
}
} else {
iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
+ ret = __import_iovec(ITER_DEST, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
&iomsg->free_iov, &iomsg->msg.msg_iter,
false);
if (ret > 0)
@@ -503,7 +503,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
}
} else {
iomsg->free_iov = iomsg->fast_iov;
- ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen,
+ ret = __import_iovec(ITER_DEST, (struct iovec __user *)uiov, msg.msg_iovlen,
UIO_FASTIOV, &iomsg->free_iov,
&iomsg->msg.msg_iter, true);
if (ret < 0)
@@ -749,7 +749,7 @@ retry_multishot:
kmsg->fast_iov[0].iov_base = buf;
kmsg->fast_iov[0].iov_len = len;
- iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
+ iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, kmsg->fast_iov, 1,
len);
}
@@ -843,7 +843,7 @@ retry_multishot:
sr->buf = buf;
}
- ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter);
+ ret = import_single_range(ITER_DEST, sr->buf, len, &iov, &msg.msg_iter);
if (unlikely(ret))
goto out_free;
@@ -1089,14 +1089,14 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
return io_setup_async_addr(req, &__address, issue_flags);
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
- ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
+ ret = io_import_fixed(ITER_SOURCE, &msg.msg_iter, req->imu,
(u64)(uintptr_t)zc->buf, zc->len);
if (unlikely(ret))
return ret;
msg.sg_from_iter = io_sg_from_iter;
} else {
io_notif_set_extended(zc->notif);
- ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
+ ret = import_single_range(ITER_SOURCE, zc->buf, zc->len, &iov,
&msg.msg_iter);
if (unlikely(ret))
return ret;
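The net.c hunks are a mechanical rename of the iov_iter direction: ITER_SOURCE replaces WRITE for buffers the kernel reads from (send paths) and ITER_DEST replaces READ for buffers the kernel fills (receive paths). A small illustrative sketch of the convention, with helper names that are not part of the patch:

/* send: data flows out of the user buffer, so it is the iterator's source */
static int import_send_buf(void __user *ubuf, size_t len,
			   struct iovec *iov, struct iov_iter *iter)
{
	return import_single_range(ITER_SOURCE, ubuf, len, iov, iter);
}

/* receive: data lands in the user buffer, so it is the iterator's destination */
static int import_recv_buf(void __user *ubuf, size_t len,
			   struct iovec *iov, struct iov_iter *iter)
{
	return import_single_range(ITER_DEST, ubuf, len, iov, iter);
}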
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 922c1a366c41..599ba28c89b2 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -40,7 +40,14 @@ struct io_poll_table {
};
#define IO_POLL_CANCEL_FLAG BIT(31)
-#define IO_POLL_REF_MASK GENMASK(30, 0)
+#define IO_POLL_RETRY_FLAG BIT(30)
+#define IO_POLL_REF_MASK GENMASK(29, 0)
+
+/*
+ * We usually have 1-2 refs taken; 128 is more than enough, and we want to
+ * maximise the margin between this amount and the point where the count
+ * overflows into the flag bits.
+ */
+#define IO_POLL_REF_BIAS 128
#define IO_WQE_F_DOUBLE 1
@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
return priv & IO_WQE_F_DOUBLE;
}
+static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
+{
+ int v;
+
+ /*
+ * poll_refs are already elevated and we don't have much hope of
+ * grabbing the ownership. Instead of incrementing, set a retry flag
+ * to notify the loop that there might have been some change.
+ */
+ v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
+ if (v & IO_POLL_REF_MASK)
+ return false;
+ return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+}
+
/*
* If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
* bump it and acquire ownership. It's disallowed to modify requests while not
@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
+ if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+ return io_poll_get_ownership_slowpath(req);
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -235,6 +259,16 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
*/
if ((v & IO_POLL_REF_MASK) != 1)
req->cqe.res = 0;
+ if (v & IO_POLL_RETRY_FLAG) {
+ req->cqe.res = 0;
+ /*
+ * We won't find new events that came in between
+ * vfs_poll and the ref put unless we clear the flag
+ * in advance.
+ */
+ atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
+ v &= ~IO_POLL_RETRY_FLAG;
+ }
/* the mask was stashed in __io_poll_execute */
if (!req->cqe.res) {
@@ -272,7 +306,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
- } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
+ } while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
+ IO_POLL_REF_MASK);
return IOU_POLL_NO_ACTION;
}
@@ -520,7 +555,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
unsigned issue_flags)
{
struct io_ring_ctx *ctx = req->ctx;
- int v;
INIT_HLIST_NODE(&req->hash_node);
req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
@@ -588,11 +622,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
if (ipt->owning) {
/*
- * Release ownership. If someone tried to queue a tw while it was
- * locked, kick it off for them.
+ * Try to release ownership. If we see a change of state, e.g.
+ * the poll was woken up, queue up a tw; it'll deal with it.
*/
- v = atomic_dec_return(&req->poll_refs);
- if (unlikely(v & IO_POLL_REF_MASK))
+ if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
__io_poll_execute(req, 0);
}
return 0;
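Taken together, the poll.c changes reserve bit 30 as a retry flag, bias the ownership fast path well below it, and release ownership with a cmpxchg so that any concurrent change is handed off to task_work. A condensed sketch of the acquire side under the definitions above (request handling elided, helper name illustrative):

static bool poll_get_ownership(atomic_t *poll_refs)
{
	int v;

	/* fast path: the count is far below the flag bits, a plain inc is safe */
	if (likely(atomic_read(poll_refs) < IO_POLL_REF_BIAS))
		return !(atomic_fetch_inc(poll_refs) & IO_POLL_REF_MASK);

	/* slow path: set the retry flag rather than risk overflowing into it */
	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, poll_refs);
	if (v & IO_POLL_REF_MASK)
		return false;
	return !(atomic_fetch_inc(poll_refs) & IO_POLL_REF_MASK);
}

On the release side, __io_arm_poll_handler() now drops its reference with atomic_cmpxchg(&req->poll_refs, 1, 0); anything other than a clean 1 -> 0 transition means the state changed underneath it, so __io_poll_execute() is queued to re-check.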
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1ce065709724..77576835a848 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -554,12 +554,12 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw)
int io_readv_prep_async(struct io_kiocb *req)
{
- return io_rw_prep_async(req, READ);
+ return io_rw_prep_async(req, ITER_DEST);
}
int io_writev_prep_async(struct io_kiocb *req)
{
- return io_rw_prep_async(req, WRITE);
+ return io_rw_prep_async(req, ITER_SOURCE);
}
/*
@@ -710,7 +710,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
loff_t *ppos;
if (!req_has_async_data(req)) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
} else {
@@ -722,7 +722,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
* buffers, as we dropped the selected one before retry.
*/
if (io_do_buffer_select(req)) {
- ret = io_import_iovec(READ, req, &iovec, s, issue_flags);
+ ret = io_import_iovec(ITER_DEST, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
}
@@ -857,7 +857,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
loff_t *ppos;
if (!req_has_async_data(req)) {
- ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags);
+ ret = io_import_iovec(ITER_SOURCE, req, &iovec, s, issue_flags);
if (unlikely(ret < 0))
return ret;
} else {
diff --git a/io_uring/xattr.c b/io_uring/xattr.c
index 99df641594d7..6201a9f442c6 100644
--- a/io_uring/xattr.c
+++ b/io_uring/xattr.c
@@ -112,7 +112,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags)
if (issue_flags & IO_URING_F_NONBLOCK)
return -EAGAIN;
- ret = do_getxattr(mnt_user_ns(req->file->f_path.mnt),
+ ret = do_getxattr(mnt_idmap(req->file->f_path.mnt),
req->file->f_path.dentry,
&ix->ctx);
@@ -133,9 +133,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags)
retry:
ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL);
if (!ret) {
- ret = do_getxattr(mnt_user_ns(path.mnt),
- path.dentry,
- &ix->ctx);
+ ret = do_getxattr(mnt_idmap(path.mnt), path.dentry, &ix->ctx);
path_put(&path);
if (retry_estale(ret, lookup_flags)) {
@@ -213,7 +211,7 @@ static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
ret = mnt_want_write(path->mnt);
if (!ret) {
- ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
+ ret = do_setxattr(mnt_idmap(path->mnt), path->dentry, &ix->ctx);
mnt_drop_write(path->mnt);
}
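Finally, the xattr hunks follow a VFS interface change: do_getxattr() and do_setxattr() now take the mount's idmapping via mnt_idmap() instead of its user namespace via mnt_user_ns(). A minimal sketch of the new calling convention (the wrapper itself is illustrative; the context type is assumed to be the upstream struct xattr_ctx):

static ssize_t getxattr_at(const struct path *path, struct xattr_ctx *ctx)
{
	/* mnt_idmap() supplies the idmapping handle that mnt_user_ns() used to */
	return do_getxattr(mnt_idmap(path->mnt), path->dentry, ctx);
}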