Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c | 265
 1 file changed, 208 insertions(+), 57 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 493e5047e67c..26978630378e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -669,12 +669,12 @@ struct io_kiocb {
* restore the work, if needed.
*/
struct {
- struct callback_head task_work;
struct hlist_node hash_node;
struct async_poll *apoll;
};
struct io_wq_work work;
};
+ struct callback_head task_work;
};
#define IO_PLUG_THRESHOLD 2
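
The hunk above moves task_work out of the union it shared with the poll fields (hash_node, apoll), so a queued callback_head can no longer be clobbered while poll state is live. Below is a minimal userspace sketch (invented field types, not the real io_kiocb layout) showing how union members alias the same storage and how hoisting one member out gives it disjoint storage.

#include <stdio.h>
#include <stddef.h>

/* Toy stand-ins for the kernel types; names and sizes are illustrative. */
struct callback_head {
        void (*func)(struct callback_head *);
};

struct req_old {                        /* task_work shares storage with poll state */
        union {
                struct callback_head task_work;
                struct {
                        void *hash_node;
                        void *apoll;
                };
        };
};

struct req_new {                        /* task_work now has its own storage */
        union {
                struct {
                        void *hash_node;
                        void *apoll;
                };
        };
        struct callback_head task_work;
};

int main(void)
{
        printf("old: task_work @%zu, hash_node @%zu (aliased)\n",
               offsetof(struct req_old, task_work),
               offsetof(struct req_old, hash_node));
        printf("new: task_work @%zu, hash_node @%zu (disjoint)\n",
               offsetof(struct req_new, task_work),
               offsetof(struct req_new, hash_node));
        return 0;
}
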
@@ -1549,12 +1549,9 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
/*
* Called if REQ_F_LINK_HEAD is set, and we fail the head request
*/
-static void io_fail_links(struct io_kiocb *req)
+static void __io_fail_links(struct io_kiocb *req)
{
struct io_ring_ctx *ctx = req->ctx;
- unsigned long flags;
-
- spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) {
struct io_kiocb *link = list_first_entry(&req->link_list,
@@ -1568,13 +1565,29 @@ static void io_fail_links(struct io_kiocb *req)
io_link_cancel_timeout(link);
} else {
io_cqring_fill_event(link, -ECANCELED);
+ link->flags |= REQ_F_COMP_LOCKED;
__io_double_put_req(link);
}
req->flags &= ~REQ_F_LINK_TIMEOUT;
}
io_commit_cqring(ctx);
- spin_unlock_irqrestore(&ctx->completion_lock, flags);
+}
+
+static void io_fail_links(struct io_kiocb *req)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+
+ if (!(req->flags & REQ_F_COMP_LOCKED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->completion_lock, flags);
+ __io_fail_links(req);
+ spin_unlock_irqrestore(&ctx->completion_lock, flags);
+ } else {
+ __io_fail_links(req);
+ }
+
io_cqring_ev_posted(ctx);
}
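
The split above lets io_fail_links() run both with and without ->completion_lock held: REQ_F_COMP_LOCKED tells it the caller already owns the lock, so only __io_fail_links() is called; otherwise the lock is taken around it. A small pthread-based analogue of that "caller may already hold the lock" pattern, with invented names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t completion_lock = PTHREAD_MUTEX_INITIALIZER;

/* Work that must only run under completion_lock. */
static void __fail_links(int req_id)
{
        printf("failing links of request %d\n", req_id);
}

/* already_locked plays the role of REQ_F_COMP_LOCKED: when set, the caller
 * owns the lock and we must not try to take it a second time. */
static void fail_links(int req_id, bool already_locked)
{
        if (!already_locked) {
                pthread_mutex_lock(&completion_lock);
                __fail_links(req_id);
                pthread_mutex_unlock(&completion_lock);
        } else {
                __fail_links(req_id);
        }
}

int main(void)
{
        fail_links(1, false);                   /* helper takes the lock */

        pthread_mutex_lock(&completion_lock);   /* caller already holds it */
        fail_links(2, true);
        pthread_mutex_unlock(&completion_lock);
        return 0;
}
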
@@ -1747,6 +1760,17 @@ static int io_put_kbuf(struct io_kiocb *req)
return cflags;
}
+static inline bool io_run_task_work(void)
+{
+ if (current->task_works) {
+ __set_current_state(TASK_RUNNING);
+ task_work_run();
+ return true;
+ }
+
+ return false;
+}
+
static void io_iopoll_queue(struct list_head *again)
{
struct io_kiocb *req;
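
The new io_run_task_work() helper replaces the open-coded "if (current->task_works) task_work_run()" checks that later hunks delete, resets the task to TASK_RUNNING first (callers may sit in TASK_INTERRUPTIBLE wait loops), and reports whether anything ran so those loops can re-check their wait condition. A userspace model of that "drain deferred work, then re-test the condition" shape, with stand-in state:

#include <stdbool.h>
#include <stdio.h>

static int pending_work = 3;    /* stand-in for current->task_works */
static int events;              /* stand-in for the CQ event count */

/* Runs queued work and reports whether anything ran, like io_run_task_work(). */
static bool run_task_work(void)
{
        if (!pending_work)
                return false;
        pending_work--;
        events++;               /* completed work may post new events */
        return true;
}

int main(void)
{
        const int min_events = 2;

        /* Shape of the io_cqring_wait() fast path: keep draining work as long
         * as it makes progress, re-checking the wait condition each time. */
        do {
                if (events >= min_events) {
                        printf("enough events (%d), no need to sleep\n", events);
                        return 0;
                }
        } while (run_task_work());

        printf("no progress, would sleep with %d events\n", events);
        return 0;
}
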
@@ -1936,6 +1960,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
*/
if (!(++iters & 7)) {
mutex_unlock(&ctx->uring_lock);
+ io_run_task_work();
mutex_lock(&ctx->uring_lock);
}
@@ -2661,8 +2686,10 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
if (req->file->f_op->read_iter)
ret2 = call_read_iter(req->file, kiocb, &iter);
- else
+ else if (req->file->f_op->read)
ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+ else
+ ret2 = -EINVAL;
/* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) {
@@ -2776,8 +2803,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
if (req->file->f_op->write_iter)
ret2 = call_write_iter(req->file, kiocb, &iter);
- else
+ else if (req->file->f_op->write)
ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+ else
+ ret2 = -EINVAL;
if (!force_nonblock)
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
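
Both the read and write hunks above only fall back to loop_rw_iter() when the file actually provides a ->read/->write method, and return -EINVAL otherwise instead of calling through a NULL pointer. A compact userspace sketch of that dispatch order; the ops structure and callbacks are mock-ups:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Mock file_operations: either callback may be NULL. */
struct file_ops {
        ssize_t (*read_iter)(const char *name);
        ssize_t (*read)(const char *name);
};

static ssize_t fast_read_iter(const char *name)
{
        printf("%s: served by ->read_iter\n", name);
        return 0;
}

static ssize_t slow_read(const char *name)
{
        printf("%s: looping over ->read\n", name);
        return 0;
}

/* Mirrors the fixed io_read() dispatch: prefer ->read_iter, fall back to
 * ->read, and fail cleanly when the file supports neither. */
static ssize_t do_read(const struct file_ops *ops, const char *name)
{
        if (ops->read_iter)
                return ops->read_iter(name);
        else if (ops->read)
                return ops->read(name);
        else
                return -EINVAL;
}

int main(void)
{
        struct file_ops both   = { fast_read_iter, slow_read };
        struct file_ops legacy = { NULL, slow_read };
        struct file_ops none   = { NULL, NULL };

        do_read(&both, "both");
        do_read(&legacy, "legacy");
        printf("none: %zd (%s)\n", do_read(&none, "none"), strerror(EINVAL));
        return 0;
}
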
@@ -4088,22 +4117,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
{
struct task_struct *tsk = req->task;
struct io_ring_ctx *ctx = req->ctx;
- int ret, notify = TWA_RESUME;
+ int ret, notify;
/*
- * SQPOLL kernel thread doesn't need notification, just a wakeup.
- * If we're not using an eventfd, then TWA_RESUME is always fine,
- * as we won't have dependencies between request completions for
- * other kernel wait conditions.
+ * SQPOLL kernel thread doesn't need notification, just a wakeup. For
+ * all other cases, use TWA_SIGNAL unconditionally to ensure we're
+ * processing task_work. There's no reliable way to tell if TWA_RESUME
+ * will do the job.
*/
- if (ctx->flags & IORING_SETUP_SQPOLL)
- notify = 0;
- else if (ctx->cq_ev_fd)
+ notify = 0;
+ if (!(ctx->flags & IORING_SETUP_SQPOLL))
notify = TWA_SIGNAL;
ret = task_work_add(tsk, cb, notify);
if (!ret)
wake_up_process(tsk);
+
return ret;
}
@@ -4124,6 +4153,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
tsk = req->task;
req->result = mask;
init_task_work(&req->task_work, func);
+ percpu_ref_get(&req->ctx->refs);
+
/*
* If this fails, then the task is exiting. When a task exits, the
* work gets canceled, so just cancel this request as well instead
@@ -4160,9 +4191,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll)
return false;
}
-static void io_poll_remove_double(struct io_kiocb *req, void *data)
+static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req)
+{
+ /* pure poll stashes this in ->io, poll driven retry elsewhere */
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return (struct io_poll_iocb *) req->io;
+ return req->apoll->double_poll;
+}
+
+static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req)
+{
+ if (req->opcode == IORING_OP_POLL_ADD)
+ return &req->poll;
+ return &req->apoll->poll;
+}
+
+static void io_poll_remove_double(struct io_kiocb *req)
{
- struct io_poll_iocb *poll = data;
+ struct io_poll_iocb *poll = io_poll_get_double(req);
lockdep_assert_held(&req->ctx->completion_lock);
@@ -4182,7 +4228,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error)
{
struct io_ring_ctx *ctx = req->ctx;
- io_poll_remove_double(req, req->io);
+ io_poll_remove_double(req);
req->poll.done = true;
io_cqring_fill_event(req, error ? error : mangle_poll(mask));
io_commit_cqring(ctx);
@@ -4208,6 +4254,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
static void io_poll_task_func(struct callback_head *cb)
{
struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+ struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt = NULL;
io_poll_task_handler(req, &nxt);
@@ -4218,13 +4265,14 @@ static void io_poll_task_func(struct callback_head *cb)
__io_queue_sqe(nxt, NULL);
mutex_unlock(&ctx->uring_lock);
}
+ percpu_ref_put(&ctx->refs);
}
static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
int sync, void *key)
{
struct io_kiocb *req = wait->private;
- struct io_poll_iocb *poll = req->apoll->double_poll;
+ struct io_poll_iocb *poll = io_poll_get_single(req);
__poll_t mask = key_to_poll(key);
/* for instances that support it check for an event match first: */
@@ -4238,6 +4286,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,
done = list_empty(&poll->wait.entry);
if (!done)
list_del_init(&poll->wait.entry);
+ /* make sure double remove sees this as being gone */
+ wait->private = NULL;
spin_unlock(&poll->head->lock);
if (!done)
__io_async_wake(req, poll, mask, io_poll_task_func);
@@ -4313,7 +4363,8 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
struct io_kiocb *req)
{
if (io_op_defs[req->opcode].needs_mm && !current->mm) {
- if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+ if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+ !mmget_not_zero(ctx->sqo_mm)))
return -EFAULT;
kthread_use_mm(ctx->sqo_mm);
}
@@ -4332,6 +4383,7 @@ static void io_async_task_func(struct callback_head *cb)
if (io_poll_rewait(req, &apoll->poll)) {
spin_unlock_irq(&ctx->completion_lock);
+ percpu_ref_put(&ctx->refs);
return;
}
@@ -4346,7 +4398,7 @@ static void io_async_task_func(struct callback_head *cb)
}
}
- io_poll_remove_double(req, apoll->double_poll);
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
/* restore ->work in case we need to retry again */
@@ -4356,7 +4408,6 @@ static void io_async_task_func(struct callback_head *cb)
kfree(apoll);
if (!canceled) {
- __set_current_state(TASK_RUNNING);
if (io_sq_thread_acquire_mm(ctx, req)) {
io_cqring_add_event(req, -EFAULT);
goto end_req;
@@ -4370,6 +4421,7 @@ end_req:
req_set_fail_links(req);
io_double_put_req(req);
}
+ percpu_ref_put(&ctx->refs);
}
static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
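
The percpu_ref_get() added in __io_async_wake() pins the ring context for as long as a poll task_work callback is queued, and the handlers above (io_poll_task_func, io_async_task_func) drop that reference on every exit path, including the rewait and cancel paths. A simplified userspace version of the same pattern, with a plain atomic counter standing in for the percpu_ref:

#include <stdatomic.h>
#include <stdio.h>

/* Toy context; a plain atomic counter stands in for percpu_ref. */
struct toy_ctx {
        atomic_int refs;
};

static void ctx_get(struct toy_ctx *ctx)
{
        atomic_fetch_add(&ctx->refs, 1);
}

static void ctx_put(struct toy_ctx *ctx)
{
        if (atomic_fetch_sub(&ctx->refs, 1) == 1)
                printf("last reference dropped, ctx may be freed\n");
}

/* The deferred handler must drop its reference on every exit path. */
static void task_work_handler(struct toy_ctx *ctx, int cancelled)
{
        if (cancelled) {
                printf("handler: request cancelled\n");
                ctx_put(ctx);
                return;
        }
        printf("handler: request completed\n");
        ctx_put(ctx);
}

int main(void)
{
        struct toy_ctx ctx;

        atomic_init(&ctx.refs, 1);      /* owner's original reference */

        ctx_get(&ctx);                  /* like percpu_ref_get(&ctx->refs) */
        task_work_handler(&ctx, 0);     /* in the real code this runs later */

        ctx_put(&ctx);                  /* owner drops its reference */
        return 0;
}
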
@@ -4472,8 +4524,8 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask,
io_async_wake);
- if (ret) {
- io_poll_remove_double(req, apoll->double_poll);
+ if (ret || ipt.error) {
+ io_poll_remove_double(req);
spin_unlock_irq(&ctx->completion_lock);
if (req->flags & REQ_F_WORK_INITIALIZED)
memcpy(&req->work, &apoll->work, sizeof(req->work));
@@ -4507,14 +4559,13 @@ static bool io_poll_remove_one(struct io_kiocb *req)
{
bool do_complete;
+ io_poll_remove_double(req);
+
if (req->opcode == IORING_OP_POLL_ADD) {
- io_poll_remove_double(req, req->io);
do_complete = __io_poll_remove_one(req, &req->poll);
} else {
struct async_poll *apoll = req->apoll;
- io_poll_remove_double(req, apoll->double_poll);
-
/* non-poll requests have submit ref still */
do_complete = __io_poll_remove_one(req, &apoll->poll);
if (do_complete) {
@@ -4536,6 +4587,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(req->ctx);
req->flags |= REQ_F_COMP_LOCKED;
+ req_set_fail_links(req);
io_put_req(req);
}
@@ -4709,6 +4761,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+static int __io_timeout_cancel(struct io_kiocb *req)
+{
+ int ret;
+
+ list_del_init(&req->list);
+
+ ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
+ if (ret == -1)
+ return -EALREADY;
+
+ req_set_fail_links(req);
+ req->flags |= REQ_F_COMP_LOCKED;
+ io_cqring_fill_event(req, -ECANCELED);
+ io_put_req(req);
+ return 0;
+}
+
static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
{
struct io_kiocb *req;
@@ -4716,7 +4785,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
list_for_each_entry(req, &ctx->timeout_list, list) {
if (user_data == req->user_data) {
- list_del_init(&req->list);
ret = 0;
break;
}
@@ -4725,14 +4793,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
if (ret == -ENOENT)
return ret;
- ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
- if (ret == -1)
- return -EALREADY;
-
- req_set_fail_links(req);
- io_cqring_fill_event(req, -ECANCELED);
- io_put_req(req);
- return 0;
+ return __io_timeout_cancel(req);
}
static int io_timeout_remove_prep(struct io_kiocb *req,
@@ -6082,8 +6143,7 @@ static int io_sq_thread(void *data)
if (!list_empty(&ctx->poll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
cond_resched();
continue;
}
@@ -6115,8 +6175,7 @@ static int io_sq_thread(void *data)
finish_wait(&ctx->sqo_wait, &wait);
break;
}
- if (current->task_works) {
- task_work_run();
+ if (io_run_task_work()) {
finish_wait(&ctx->sqo_wait, &wait);
continue;
}
@@ -6145,8 +6204,7 @@ static int io_sq_thread(void *data)
timeout = jiffies + ctx->sq_thread_idle;
}
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
io_sq_thread_drop_mm(ctx);
revert_creds(old_cred);
@@ -6211,9 +6269,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
if (io_cqring_events(ctx, false) >= min_events)
return 0;
- if (!current->task_works)
+ if (!io_run_task_work())
break;
- task_work_run();
} while (1);
if (sig) {
@@ -6235,8 +6292,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
/* make sure we run task_work before checking for signals */
- if (current->task_works)
- task_work_run();
+ if (io_run_task_work())
+ continue;
if (signal_pending(current)) {
if (current->jobctl & JOBCTL_TASK_WORK) {
spin_lock_irq(&current->sighand->siglock);
@@ -7086,6 +7143,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
return SIZE_MAX;
#endif
+ if (sq_offset)
+ *sq_offset = off;
+
sq_array_size = array_size(sizeof(u32), sq_entries);
if (sq_array_size == SIZE_MAX)
return SIZE_MAX;
@@ -7093,9 +7153,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
if (check_add_overflow(off, sq_array_size, &off))
return SIZE_MAX;
- if (sq_offset)
- *sq_offset = off;
-
return off;
}
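
The two rings_size() hunks above record *sq_offset before the SQ index array size is added to the running total, so it now points at the start of that array rather than past the end of the allocation. A simplified userspace model of the corrected arithmetic; the header and entry sizes are made up and overflow checks are omitted:

#include <stdint.h>
#include <stdio.h>

/* The CQE array comes first, then the SQ index array. Recording *sq_offset
 * before the SQ array size is added yields the start of that array;
 * recording it afterwards (the old code) pointed past the allocation. */
static size_t rings_size(unsigned sq_entries, unsigned cq_entries,
                         size_t *sq_offset)
{
        size_t off = 64;                                /* pretend ring header */

        off += (size_t)cq_entries * 16;                 /* pretend CQE array */
        if (sq_offset)
                *sq_offset = off;                       /* start of SQ array */
        off += (size_t)sq_entries * sizeof(uint32_t);   /* SQ index array */
        return off;
}

int main(void)
{
        size_t sq_off;
        size_t total = rings_size(8, 16, &sq_off);

        printf("total=%zu bytes, SQ index array at offset %zu\n", total, sq_off);
        return 0;
}
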
@@ -7488,6 +7545,98 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data)
return work->files == files;
}
+/*
+ * Returns true if 'preq' is the link parent of 'req'
+ */
+static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req)
+{
+ struct io_kiocb *link;
+
+ if (!(preq->flags & REQ_F_LINK_HEAD))
+ return false;
+
+ list_for_each_entry(link, &preq->link_list, link_list) {
+ if (link == req)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * We're looking to cancel 'req' because it's holding on to our files, but
+ * 'req' could be a link to another request. See if it is, and cancel that
+ * parent request if so.
+ */
+static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ struct hlist_node *tmp;
+ struct io_kiocb *preq;
+ bool found = false;
+ int i;
+
+ spin_lock_irq(&ctx->completion_lock);
+ for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
+ struct hlist_head *list;
+
+ list = &ctx->cancel_hash[i];
+ hlist_for_each_entry_safe(preq, tmp, list, hash_node) {
+ found = io_match_link(preq, req);
+ if (found) {
+ io_poll_remove_one(preq);
+ break;
+ }
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
+static bool io_timeout_remove_link(struct io_ring_ctx *ctx,
+ struct io_kiocb *req)
+{
+ struct io_kiocb *preq;
+ bool found = false;
+
+ spin_lock_irq(&ctx->completion_lock);
+ list_for_each_entry(preq, &ctx->timeout_list, list) {
+ found = io_match_link(preq, req);
+ if (found) {
+ __io_timeout_cancel(preq);
+ break;
+ }
+ }
+ spin_unlock_irq(&ctx->completion_lock);
+ return found;
+}
+
+static bool io_cancel_link_cb(struct io_wq_work *work, void *data)
+{
+ return io_match_link(container_of(work, struct io_kiocb, work), data);
+}
+
+static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+ enum io_wq_cancel cret;
+
+ /* cancel this particular work, if it's running */
+ cret = io_wq_cancel_work(ctx->io_wq, &req->work);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ return;
+
+ /* find links that hold this pending, cancel those */
+ cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true);
+ if (cret != IO_WQ_CANCEL_NOTFOUND)
+ return;
+
+ /* if we have a poll link holding this pending, cancel that */
+ if (io_poll_remove_link(ctx, req))
+ return;
+
+ /* final option, timeout link is holding this req pending */
+ io_timeout_remove_link(ctx, req);
+}
+
static void io_uring_cancel_files(struct io_ring_ctx *ctx,
struct files_struct *files)
{
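
io_attempt_cancel() above escalates through progressively wider strategies: cancel the request's own io-wq work, then any io-wq work whose link chain holds it, then a poll link, and finally a timeout link. A userspace sketch of that "try each strategy until one claims the request" structure; the strategy functions are invented:

#include <stdbool.h>
#include <stdio.h>

/* Each strategy returns true if it found and cancelled the request. */
static bool cancel_running_work(int req)
{
        (void)req;
        return false;   /* not currently executing in io-wq */
}

static bool cancel_wq_link(int req)
{
        (void)req;
        return false;   /* no io-wq work links to it */
}

static bool cancel_poll_link(int req)
{
        printf("req %d: cancelled via its poll link\n", req);
        return true;
}

static bool cancel_timeout_link(int req)
{
        (void)req;
        return false;
}

/* Mirrors the shape of io_attempt_cancel(): stop at the first strategy that
 * claims the request, otherwise fall through to the next, wider one. */
static void attempt_cancel(int req)
{
        if (cancel_running_work(req))
                return;
        if (cancel_wq_link(req))
                return;
        if (cancel_poll_link(req))
                return;
        cancel_timeout_link(req);
}

int main(void)
{
        attempt_cancel(42);
        return 0;
}
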
@@ -7529,10 +7678,10 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
clear_bit(0, &ctx->cq_check_overflow);
ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
- spin_unlock_irq(&ctx->completion_lock);
-
WRITE_ONCE(ctx->rings->cq_overflow,
atomic_inc_return(&ctx->cached_cq_overflow));
+ io_commit_cqring(ctx);
+ spin_unlock_irq(&ctx->completion_lock);
/*
* Put inflight ref and overflow ref. If that's
@@ -7544,7 +7693,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
continue;
}
} else {
- io_wq_cancel_work(ctx->io_wq, &cancel_req->work);
+ /* cancel this request, or head link requests */
+ io_attempt_cancel(ctx, cancel_req);
io_put_req(cancel_req);
}
@@ -7655,8 +7805,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
int submitted = 0;
struct fd f;
- if (current->task_works)
- task_work_run();
+ io_run_task_work();
if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
return -EINVAL;
@@ -7828,6 +7977,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_rings *rings;
size_t size, sq_array_offset;
+ /* make sure these are sane, as we already accounted them */
+ ctx->sq_entries = p->sq_entries;
+ ctx->cq_entries = p->cq_entries;
+
size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset);
if (size == SIZE_MAX)
return -EOVERFLOW;
@@ -7844,8 +7997,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
rings->cq_ring_entries = p->cq_entries;
ctx->sq_mask = rings->sq_ring_mask;
ctx->cq_mask = rings->cq_ring_mask;
- ctx->sq_entries = rings->sq_ring_entries;
- ctx->cq_entries = rings->cq_ring_entries;
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
if (size == SIZE_MAX) {