From 7701efa45b5bd5c28e09ada3a446c3c7ca7fe62b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 12:48:13 -0400 Subject: io_uring: abstract out task work running [ Upstream commit 4c6e277c4cc4a6b3b2b9c66a7b014787ae757cc1 ] Provide a helper to run task_work instead of checking and running manually in a bunch of different spots. While doing so, also move the task run state setting where we run the task work. Then we can move it out of the callback helpers. This also helps ensure we only do this once per task_work list run, not per task_work item. Suggested-by: Oleg Nesterov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 493e5047e67c..95bacab047dd 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1747,6 +1747,17 @@ static int io_put_kbuf(struct io_kiocb *req) return cflags; } +static inline bool io_run_task_work(void) +{ + if (current->task_works) { + __set_current_state(TASK_RUNNING); + task_work_run(); + return true; + } + + return false; +} + static void io_iopoll_queue(struct list_head *again) { struct io_kiocb *req; @@ -1936,6 +1947,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, */ if (!(++iters & 7)) { mutex_unlock(&ctx->uring_lock); + io_run_task_work(); mutex_lock(&ctx->uring_lock); } @@ -4356,7 +4368,6 @@ static void io_async_task_func(struct callback_head *cb) kfree(apoll); if (!canceled) { - __set_current_state(TASK_RUNNING); if (io_sq_thread_acquire_mm(ctx, req)) { io_cqring_add_event(req, -EFAULT); goto end_req; @@ -6082,8 +6093,7 @@ static int io_sq_thread(void *data) if (!list_empty(&ctx->poll_list) || need_resched() || (!time_after(jiffies, timeout) && ret != -EBUSY && !percpu_ref_is_dying(&ctx->refs))) { - if (current->task_works) - task_work_run(); + io_run_task_work(); cond_resched(); continue; } @@ -6115,8 +6125,7 @@ static int io_sq_thread(void *data) finish_wait(&ctx->sqo_wait, &wait); break; } - if (current->task_works) { - task_work_run(); + if (io_run_task_work()) { finish_wait(&ctx->sqo_wait, &wait); continue; } @@ -6145,8 +6154,7 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (current->task_works) - task_work_run(); + io_run_task_work(); io_sq_thread_drop_mm(ctx); revert_creds(old_cred); @@ -6211,9 +6219,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, do { if (io_cqring_events(ctx, false) >= min_events) return 0; - if (!current->task_works) + if (!io_run_task_work()) break; - task_work_run(); } while (1); if (sig) { @@ -6235,8 +6242,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, TASK_INTERRUPTIBLE); /* make sure we run task_work before checking for signals */ - if (current->task_works) - task_work_run(); + if (io_run_task_work()) + continue; if (signal_pending(current)) { if (current->jobctl & JOBCTL_TASK_WORK) { spin_lock_irq(¤t->sighand->siglock); @@ -7655,8 +7662,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (current->task_works) - task_work_run(); + io_run_task_work(); if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; -- cgit v1.2.3 From ff316db596a86fce8728f86c1f4d6dc0d6f84200 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Sat, 11 Jul 2020 11:31:11 +0200 Subject: io_uring: fix sq array offset calculation [ Upstream commit b36200f543ff07a1cb346aa582349141df2c8068 ] rings_size() sets sq_offset to the total size of the rings (the returned value which is used for memory allocation). This is wrong: sq array should be located within the rings, not after them. Set sq_offset to where it should be. Fixes: 75b28affdd6a ("io_uring: allocate the two rings together") Signed-off-by: Dmitry Vyukov Acked-by: Hristo Venev Cc: io-uring@vger.kernel.org Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 95bacab047dd..8503aec7ea29 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7093,6 +7093,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, return SIZE_MAX; #endif + if (sq_offset) + *sq_offset = off; + sq_array_size = array_size(sizeof(u32), sq_entries); if (sq_array_size == SIZE_MAX) return SIZE_MAX; @@ -7100,9 +7103,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries, if (check_add_overflow(off, sq_array_size, &off)) return SIZE_MAX; - if (sq_offset) - *sq_offset = off; - return off; } -- cgit v1.2.3 From 5a4f7281d5a61e7330a43ef13b0bb5fa5f8279c4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 27 Jun 2020 14:04:59 +0300 Subject: io_uring: fix req->work corruption [ Upstream commit 8ef77766ba8694968ed4ba24311b4bacee14f235 ] req->work and req->task_work are in a union, so io_req_task_queue() screws everything that was in work. De-union them for now. [ 704.367253] BUG: unable to handle page fault for address: ffffffffaf7330d0 [ 704.367256] #PF: supervisor write access in kernel mode [ 704.367256] #PF: error_code(0x0003) - permissions violation [ 704.367261] CPU: 6 PID: 1654 Comm: io_wqe_worker-0 Tainted: G I 5.8.0-rc2-00038-ge28d0bdc4863-dirty #498 [ 704.367265] RIP: 0010:_raw_spin_lock+0x1e/0x36 ... [ 704.367276] __alloc_fd+0x35/0x150 [ 704.367279] __get_unused_fd_flags+0x25/0x30 [ 704.367280] io_openat2+0xcb/0x1b0 [ 704.367283] io_issue_sqe+0x36a/0x1320 [ 704.367294] io_wq_submit_work+0x58/0x160 [ 704.367295] io_worker_handle_work+0x2a3/0x430 [ 704.367296] io_wqe_worker+0x2a0/0x350 [ 704.367301] kthread+0x136/0x180 [ 704.367304] ret_from_fork+0x22/0x30 Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 8503aec7ea29..d732566955d3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -669,12 +669,12 @@ struct io_kiocb { * restore the work, if needed. */ struct { - struct callback_head task_work; struct hlist_node hash_node; struct async_poll *apoll; }; struct io_wq_work work; }; + struct callback_head task_work; }; #define IO_PLUG_THRESHOLD 2 -- cgit v1.2.3 From 458ae14d4ce589e7201be151f6b6a0151907985d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:47 +0300 Subject: io_uring: fix racy overflow count reporting [ Upstream commit b2bd1cf99f3e7c8fbf12ea07af2c6998e1209e25 ] All ->cq_overflow modifications should be under completion_lock, otherwise it can report a wrong number to the userspace. Fix it in io_uring_cancel_files(). Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index d732566955d3..1d8761a9f3b8 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7536,10 +7536,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, clear_bit(0, &ctx->cq_check_overflow); ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; } - spin_unlock_irq(&ctx->completion_lock); - WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + spin_unlock_irq(&ctx->completion_lock); /* * Put inflight ref and overflow ref. If that's -- cgit v1.2.3 From d5c9f20e4ad771bcc07a89c8109de1719fc576b2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 30 Jul 2020 18:43:48 +0300 Subject: io_uring: fix stalled deferred requests [ Upstream commit dd9dfcdf5a603680458f5e7b0d2273c66e5417db ] Always do io_commit_cqring() after completing a request, even if it was accounted as overflowed on the CQ side. Failing to do that may lead to not to pushing deferred requests when needed, and so stalling the whole ring. Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1d8761a9f3b8..1619ca74b44d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7538,6 +7538,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } WRITE_ONCE(ctx->rings->cq_overflow, atomic_inc_return(&ctx->cached_cq_overflow)); + io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); /* -- cgit v1.2.3 From 93bc88750e69a594a56dbbb341171ec589d72cca Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 5 Aug 2020 12:58:23 -0600 Subject: io_uring: set ctx sq/cq entry count earlier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bd74048108c179cea0ff52979506164c80f29da7 upstream. If we hit an earlier error path in io_uring_create(), then we will have accounted memory, but not set ctx->{sq,cq}_entries yet. Then when the ring is torn down in error, we use those values to unaccount the memory. Ensure we set the ctx entries before we're able to hit a potential error path. Cc: stable@vger.kernel.org Reported-by: Tomáš Chaloupka Tested-by: Tomáš Chaloupka Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 1619ca74b44d..98c99e62a628 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7834,6 +7834,10 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_rings *rings; size_t size, sq_array_offset; + /* make sure these are sane, as we already accounted them */ + ctx->sq_entries = p->sq_entries; + ctx->cq_entries = p->cq_entries; + size = rings_size(p->sq_entries, p->cq_entries, &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; @@ -7850,8 +7854,6 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; ctx->sq_mask = rings->sq_ring_mask; ctx->cq_mask = rings->cq_ring_mask; - ctx->sq_entries = rings->sq_ring_entries; - ctx->cq_entries = rings->cq_ring_entries; size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { -- cgit v1.2.3 From 3c5d00a211282436a31d4665d11e59b45e4425be Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 6 Aug 2020 19:41:50 -0600 Subject: io_uring: use TWA_SIGNAL for task_work uncondtionally commit 0ba9c9edcd152158a0e321a4c13ac1dfc571ff3d upstream. An earlier commit: b7db41c9e03b ("io_uring: fix regression with always ignoring signals in io_cqring_wait()") ensured that we didn't get stuck waiting for eventfd reads when it's registered with the io_uring ring for event notification, but we still have cases where the task can be waiting on other events in the kernel and need a bigger nudge to make forward progress. Or the task could be in the kernel and running, but on its way to blocking. This means that TWA_RESUME cannot reliably be used to ensure we make progress. Use TWA_SIGNAL unconditionally. Cc: stable@vger.kernel.org # v5.7+ Reported-by: Josef Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 98c99e62a628..eb8407b5d15e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4100,22 +4100,22 @@ static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb) { struct task_struct *tsk = req->task; struct io_ring_ctx *ctx = req->ctx; - int ret, notify = TWA_RESUME; + int ret, notify; /* - * SQPOLL kernel thread doesn't need notification, just a wakeup. - * If we're not using an eventfd, then TWA_RESUME is always fine, - * as we won't have dependencies between request completions for - * other kernel wait conditions. + * SQPOLL kernel thread doesn't need notification, just a wakeup. For + * all other cases, use TWA_SIGNAL unconditionally to ensure we're + * processing task_work. There's no reliable way to tell if TWA_RESUME + * will do the job. */ - if (ctx->flags & IORING_SETUP_SQPOLL) - notify = 0; - else if (ctx->cq_ev_fd) + notify = 0; + if (!(ctx->flags & IORING_SETUP_SQPOLL)) notify = TWA_SIGNAL; ret = task_work_add(tsk, cb, notify); if (!ret) wake_up_process(tsk); + return ret; } -- cgit v1.2.3 From fe552346a87b9890fab5aac4a87abd27a11db255 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 09:50:19 -0600 Subject: io_uring: fail poll arm on queue proc failure commit a36da65c46565d2527eec3efdb546251e38253fd upstream. Check the ipt.error value, it must have been either cleared to zero or set to another error than the default -EINVAL if we don't go through the waitqueue proc addition. Just give up on poll at that point and return failure, this will fallback to async work. io_poll_add() doesn't suffer from this failure case, as it returns the error value directly. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+a730016dc0bdce4f6ff5@syzkaller.appspotmail.com Reviewed-by: Stefano Garzarella Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index eb8407b5d15e..0edc56ffd5ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4483,7 +4483,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); - if (ret) { + if (ret || ipt.error) { io_poll_remove_double(req, apoll->double_poll); spin_unlock_irq(&ctx->completion_lock); if (req->flags & REQ_F_WORK_INITIALIZED) -- cgit v1.2.3 From 4db216cc1d59b445df85b57be025394f8d5cc723 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 15 Aug 2020 11:44:50 -0700 Subject: io_uring: sanitize double poll handling commit d4e7cd36a90e38e0276d6ce0c20f5ccef17ec38c upstream. There's a bit of confusion on the matching pairs of poll vs double poll, depending on if the request is a pure poll (IORING_OP_POLL_ADD) or poll driven retry. Add io_poll_get_double() that returns the double poll waitqueue, if any, and io_poll_get_single() that returns the original poll waitqueue. With that, remove the argument to io_poll_remove_double(). Finally ensure that wait->private is cleared once the double poll handler has run, so that remove knows it's already been seen. Cc: stable@vger.kernel.org # v5.8 Reported-by: syzbot+7f617d4a9369028b8a2c@syzkaller.appspotmail.com Fixes: 18bceab101ad ("io_uring: allow POLL_ADD with double poll_wait() users") Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0edc56ffd5ef..9b2c6ff82f2b 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4172,9 +4172,24 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) return false; } -static void io_poll_remove_double(struct io_kiocb *req, void *data) +static struct io_poll_iocb *io_poll_get_double(struct io_kiocb *req) { - struct io_poll_iocb *poll = data; + /* pure poll stashes this in ->io, poll driven retry elsewhere */ + if (req->opcode == IORING_OP_POLL_ADD) + return (struct io_poll_iocb *) req->io; + return req->apoll->double_poll; +} + +static struct io_poll_iocb *io_poll_get_single(struct io_kiocb *req) +{ + if (req->opcode == IORING_OP_POLL_ADD) + return &req->poll; + return &req->apoll->poll; +} + +static void io_poll_remove_double(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = io_poll_get_double(req); lockdep_assert_held(&req->ctx->completion_lock); @@ -4194,7 +4209,7 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) { struct io_ring_ctx *ctx = req->ctx; - io_poll_remove_double(req, req->io); + io_poll_remove_double(req); req->poll.done = true; io_cqring_fill_event(req, error ? error : mangle_poll(mask)); io_commit_cqring(ctx); @@ -4236,7 +4251,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_kiocb *req = wait->private; - struct io_poll_iocb *poll = req->apoll->double_poll; + struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); /* for instances that support it check for an event match first: */ @@ -4250,6 +4265,8 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); + /* make sure double remove sees this as being gone */ + wait->private = NULL; spin_unlock(&poll->head->lock); if (!done) __io_async_wake(req, poll, mask, io_poll_task_func); @@ -4358,7 +4375,7 @@ static void io_async_task_func(struct callback_head *cb) } } - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); /* restore ->work in case we need to retry again */ @@ -4484,7 +4501,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req) ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, io_async_wake); if (ret || ipt.error) { - io_poll_remove_double(req, apoll->double_poll); + io_poll_remove_double(req); spin_unlock_irq(&ctx->completion_lock); if (req->flags & REQ_F_WORK_INITIALIZED) memcpy(&req->work, &apoll->work, sizeof(req->work)); @@ -4518,14 +4535,13 @@ static bool io_poll_remove_one(struct io_kiocb *req) { bool do_complete; + io_poll_remove_double(req); + if (req->opcode == IORING_OP_POLL_ADD) { - io_poll_remove_double(req, req->io); do_complete = __io_poll_remove_one(req, &req->poll); } else { struct async_poll *apoll = req->apoll; - io_poll_remove_double(req, apoll->double_poll); - /* non-poll requests have submit ref still */ do_complete = __io_poll_remove_one(req, &apoll->poll); if (do_complete) { -- cgit v1.2.3 From a4418e11cf9b617b0d433ec9507291dc785d3501 Mon Sep 17 00:00:00 2001 From: Guoyu Huang Date: Wed, 5 Aug 2020 03:53:50 -0700 Subject: io_uring: Fix NULL pointer dereference in loop_rw_iter() commit 2dd2111d0d383df104b144e0d1f6b5a00cb7cd88 upstream. loop_rw_iter() does not check whether the file has a read or write function. This can lead to NULL pointer dereference when the user passes in a file descriptor that does not have read or write function. The crash log looks like this: [ 99.834071] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 99.835364] #PF: supervisor instruction fetch in kernel mode [ 99.836522] #PF: error_code(0x0010) - not-present page [ 99.837771] PGD 8000000079d62067 P4D 8000000079d62067 PUD 79d8c067 PMD 0 [ 99.839649] Oops: 0010 [#2] SMP PTI [ 99.840591] CPU: 1 PID: 333 Comm: io_wqe_worker-0 Tainted: G D 5.8.0 #2 [ 99.842622] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 99.845140] RIP: 0010:0x0 [ 99.845840] Code: Bad RIP value. [ 99.846672] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.848018] RAX: 0000000000000000 RBX: ffff92363bd67300 RCX: ffff92363d461208 [ 99.849854] RDX: 0000000000000010 RSI: 00007ffdbf696bb0 RDI: ffff92363bd67300 [ 99.851743] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.853394] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.855148] R13: 0000000000000000 R14: ffff92363d461208 R15: ffffa1c7c01ebc68 [ 99.856914] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.858651] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.860032] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 [ 99.861979] Call Trace: [ 99.862617] loop_rw_iter.part.0+0xad/0x110 [ 99.863838] io_write+0x2ae/0x380 [ 99.864644] ? kvm_sched_clock_read+0x11/0x20 [ 99.865595] ? sched_clock+0x9/0x10 [ 99.866453] ? sched_clock_cpu+0x11/0xb0 [ 99.867326] ? newidle_balance+0x1d4/0x3c0 [ 99.868283] io_issue_sqe+0xd8f/0x1340 [ 99.869216] ? __switch_to+0x7f/0x450 [ 99.870280] ? __switch_to_asm+0x42/0x70 [ 99.871254] ? __switch_to_asm+0x36/0x70 [ 99.872133] ? lock_timer_base+0x72/0xa0 [ 99.873155] ? switch_mm_irqs_off+0x1bf/0x420 [ 99.874152] io_wq_submit_work+0x64/0x180 [ 99.875192] ? kthread_use_mm+0x71/0x100 [ 99.876132] io_worker_handle_work+0x267/0x440 [ 99.877233] io_wqe_worker+0x297/0x350 [ 99.878145] kthread+0x112/0x150 [ 99.878849] ? __io_worker_unuse+0x100/0x100 [ 99.879935] ? kthread_park+0x90/0x90 [ 99.880874] ret_from_fork+0x22/0x30 [ 99.881679] Modules linked in: [ 99.882493] CR2: 0000000000000000 [ 99.883324] ---[ end trace 4453745f4673190b ]--- [ 99.884289] RIP: 0010:0x0 [ 99.884837] Code: Bad RIP value. [ 99.885492] RSP: 0018:ffffa1c7c01ebc08 EFLAGS: 00010202 [ 99.886851] RAX: 0000000000000000 RBX: ffff92363acd7f00 RCX: ffff92363d461608 [ 99.888561] RDX: 0000000000000010 RSI: 00007ffe040d9e10 RDI: ffff92363acd7f00 [ 99.890203] RBP: ffffa1c7c01ebc40 R08: 0000000000000000 R09: 0000000000000000 [ 99.891907] R10: ffffffff9ec692a0 R11: 0000000000000000 R12: 0000000000000010 [ 99.894106] R13: 0000000000000000 R14: ffff92363d461608 R15: ffffa1c7c01ebc68 [ 99.896079] FS: 0000000000000000(0000) GS:ffff92363dd00000(0000) knlGS:0000000000000000 [ 99.898017] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 99.899197] CR2: ffffffffffffffd6 CR3: 000000007ac66000 CR4: 00000000000006e0 Fixes: 32960613b7c3 ("io_uring: correctly handle non ->{read,write}_iter() file_operations") Cc: stable@vger.kernel.org Signed-off-by: Guoyu Huang Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 9b2c6ff82f2b..e42c7583ee5e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2673,8 +2673,10 @@ static int io_read(struct io_kiocb *req, bool force_nonblock) if (req->file->f_op->read_iter) ret2 = call_read_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->read) ret2 = loop_rw_iter(READ, req->file, kiocb, &iter); + else + ret2 = -EINVAL; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -2788,8 +2790,10 @@ static int io_write(struct io_kiocb *req, bool force_nonblock) if (req->file->f_op->write_iter) ret2 = call_write_iter(req->file, kiocb, &iter); - else + else if (req->file->f_op->write) ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter); + else + ret2 = -EINVAL; if (!force_nonblock) current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; -- cgit v1.2.3 From 960ea38bfe7a0edd403d6a9e2d358018b4ce3bfa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Aug 2020 08:04:14 -0600 Subject: io_uring: hold 'ctx' reference around task_work queue + execute commit 6d816e088c359866f9867057e04f244c608c42fe upstream. We're holding the request reference, but we need to go one higher to ensure that the ctx remains valid after the request has finished. If the ring is closed with pending task_work inflight, and the given io_kiocb finishes sync during issue, then we need a reference to the ring itself around the task_work execution cycle. Cc: stable@vger.kernel.org # v5.7+ Reported-by: syzbot+9b260fc33297966f5a8e@syzkaller.appspotmail.com Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index e42c7583ee5e..0918fae6270c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4140,6 +4140,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll, tsk = req->task; req->result = mask; init_task_work(&req->task_work, func); + percpu_ref_get(&req->ctx->refs); + /* * If this fails, then the task is exiting. When a task exits, the * work gets canceled, so just cancel this request as well instead @@ -4239,6 +4241,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt) static void io_poll_task_func(struct callback_head *cb) { struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); + struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt = NULL; io_poll_task_handler(req, &nxt); @@ -4249,6 +4252,7 @@ static void io_poll_task_func(struct callback_head *cb) __io_queue_sqe(nxt, NULL); mutex_unlock(&ctx->uring_lock); } + percpu_ref_put(&ctx->refs); } static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, @@ -4365,6 +4369,7 @@ static void io_async_task_func(struct callback_head *cb) if (io_poll_rewait(req, &apoll->poll)) { spin_unlock_irq(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); return; } @@ -4402,6 +4407,7 @@ end_req: req_set_fail_links(req); io_double_put_req(req); } + percpu_ref_put(&ctx->refs); } static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync, -- cgit v1.2.3 From d181cfc89885486af0cad9e89e3fec29eb333957 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Aug 2020 10:54:02 -0600 Subject: io_uring: add missing REQ_F_COMP_LOCKED for nested requests commit 9b7adba9eaec28e0e4343c96d0dbeb9578802f5f upstream. When we traverse into failing links or timeouts, we need to ensure we propagate the REQ_F_COMP_LOCKED flag to ensure that we correctly signal to the completion side that we already hold the completion lock. Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 0918fae6270c..6461b3cd5fc0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1549,12 +1549,9 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) /* * Called if REQ_F_LINK_HEAD is set, and we fail the head request */ -static void io_fail_links(struct io_kiocb *req) +static void __io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - - spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { struct io_kiocb *link = list_first_entry(&req->link_list, @@ -1568,13 +1565,29 @@ static void io_fail_links(struct io_kiocb *req) io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); + link->flags |= REQ_F_COMP_LOCKED; __io_double_put_req(link); } req->flags &= ~REQ_F_LINK_TIMEOUT; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); +} + +static void io_fail_links(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + if (!(req->flags & REQ_F_COMP_LOCKED)) { + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + __io_fail_links(req); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + __io_fail_links(req); + } + io_cqring_ev_posted(ctx); } @@ -4767,6 +4780,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) return -EALREADY; req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; -- cgit v1.2.3 From df18711f9c002335f278628206ffc65af6fdf1f6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 12 Aug 2020 17:33:30 -0600 Subject: io_uring: enable lookup of links holding inflight files commit f254ac04c8744cf7bfed012717eac34eacc65dfb upstream. When a process exits, we cancel whatever requests it has pending that are referencing the file table. However, if a link is holding a reference, then we cannot find it by simply looking at the inflight list. Enable checking of the poll and timeout list to find the link, and cancel it appropriately. Cc: stable@vger.kernel.org Reported-by: Josef Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 10 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index 6461b3cd5fc0..f926d94867f7 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4586,6 +4586,7 @@ static bool io_poll_remove_one(struct io_kiocb *req) io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(req->ctx); req->flags |= REQ_F_COMP_LOCKED; + req_set_fail_links(req); io_put_req(req); } @@ -4759,6 +4760,23 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +static int __io_timeout_cancel(struct io_kiocb *req) +{ + int ret; + + list_del_init(&req->list); + + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); + if (ret == -1) + return -EALREADY; + + req_set_fail_links(req); + req->flags |= REQ_F_COMP_LOCKED; + io_cqring_fill_event(req, -ECANCELED); + io_put_req(req); + return 0; +} + static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) { struct io_kiocb *req; @@ -4766,7 +4784,6 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) list_for_each_entry(req, &ctx->timeout_list, list) { if (user_data == req->user_data) { - list_del_init(&req->list); ret = 0; break; } @@ -4775,15 +4792,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->io->timeout.timer); - if (ret == -1) - return -EALREADY; - - req_set_fail_links(req); - req->flags |= REQ_F_COMP_LOCKED; - io_cqring_fill_event(req, -ECANCELED); - io_put_req(req); - return 0; + return __io_timeout_cancel(req); } static int io_timeout_remove_prep(struct io_kiocb *req, @@ -7535,6 +7544,71 @@ static bool io_wq_files_match(struct io_wq_work *work, void *data) return work->files == files; } +/* + * Returns true if 'preq' is the link parent of 'req' + */ +static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(preq->flags & REQ_F_LINK_HEAD)) + return false; + + list_for_each_entry(link, &preq->link_list, link_list) { + if (link == req) + return true; + } + + return false; +} + +/* + * We're looking to cancel 'req' because it's holding on to our files, but + * 'req' could be a link to another request. See if it is, and cancel that + * parent request if so. + */ +static bool io_poll_remove_link(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hlist_node *tmp; + struct io_kiocb *preq; + bool found = false; + int i; + + spin_lock_irq(&ctx->completion_lock); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(preq, tmp, list, hash_node) { + found = io_match_link(preq, req); + if (found) { + io_poll_remove_one(preq); + break; + } + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + +static bool io_timeout_remove_link(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + struct io_kiocb *preq; + bool found = false; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry(preq, &ctx->timeout_list, list) { + found = io_match_link(preq, req); + if (found) { + __io_timeout_cancel(preq); + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + return found; +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -7592,6 +7666,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, } } else { io_wq_cancel_work(ctx->io_wq, &cancel_req->work); + /* could be a link, check and remove if it is */ + if (!io_poll_remove_link(ctx, cancel_req)) + io_timeout_remove_link(ctx, cancel_req); io_put_req(cancel_req); } -- cgit v1.2.3 From 917cdc0d2c53637e3cce1a216bfa4e0477c7851b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 16 Aug 2020 08:23:05 -0700 Subject: io_uring: find and cancel head link async work on files exit commit b711d4eaf0c408a811311ee3e94d6e9e5a230a9a upstream. Commit f254ac04c874 ("io_uring: enable lookup of links holding inflight files") only handled 2 out of the three head link cases we have, we also need to lookup and cancel work that is blocked in io-wq if that work has a link that's holding a reference to the files structure. Put the "cancel head links that hold this request pending" logic into io_attempt_cancel(), which will to through the motions of finding and canceling head links that hold the current inflight files stable request pending. Cc: stable@vger.kernel.org Reported-by: Pavel Begunkov Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index f926d94867f7..dd8ad87540ef 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7609,6 +7609,33 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, return found; } +static bool io_cancel_link_cb(struct io_wq_work *work, void *data) +{ + return io_match_link(container_of(work, struct io_kiocb, work), data); +} + +static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + enum io_wq_cancel cret; + + /* cancel this particular work, if it's running */ + cret = io_wq_cancel_work(ctx->io_wq, &req->work); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* find links that hold this pending, cancel those */ + cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_link_cb, req, true); + if (cret != IO_WQ_CANCEL_NOTFOUND) + return; + + /* if we have a poll link holding this pending, cancel that */ + if (io_poll_remove_link(ctx, req)) + return; + + /* final option, timeout link is holding this req pending */ + io_timeout_remove_link(ctx, req); +} + static void io_uring_cancel_files(struct io_ring_ctx *ctx, struct files_struct *files) { @@ -7665,10 +7692,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx, continue; } } else { - io_wq_cancel_work(ctx->io_wq, &cancel_req->work); - /* could be a link, check and remove if it is */ - if (!io_poll_remove_link(ctx, cancel_req)) - io_timeout_remove_link(ctx, cancel_req); + /* cancel this request, or head link requests */ + io_attempt_cancel(ctx, cancel_req); io_put_req(cancel_req); } -- cgit v1.2.3 From 44f639d5aeda30318fc86aec2fe99581fc24a134 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Aug 2020 16:42:35 -0600 Subject: io_uring: fix missing ->mm on exit Upstream commits: 8eb06d7e8dd85 ("io_uring: fix missing ->mm on exit") cbcf72148da4a ("io_uring: return locked and pinned page accounting") do_exit() first drops current->mm and then runs task_work, from where io_sq_thread_acquire_mm() would try to set mm for a user dying process. [ 208.004249] WARNING: CPU: 2 PID: 1854 at kernel/kthread.c:1238 kthread_use_mm+0x244/0x270 [ 208.004287] kthread_use_mm+0x244/0x270 [ 208.004288] io_sq_thread_acquire_mm.part.0+0x54/0x80 [ 208.004290] io_async_task_func+0x258/0x2ac [ 208.004291] task_work_run+0xc8/0x210 [ 208.004294] do_exit+0x1b8/0x430 [ 208.004295] do_group_exit+0x44/0xac [ 208.004296] get_signal+0x164/0x69c [ 208.004298] do_signal+0x94/0x1d0 [ 208.004299] do_notify_resume+0x18c/0x340 [ 208.004300] work_pending+0x8/0x3d4 Reported-by: Roman Gershman Tested-by: Roman Gershman Signed-off-by: Pavel Begunkov Signed-off-by: Greg Kroah-Hartman --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/io_uring.c') diff --git a/fs/io_uring.c b/fs/io_uring.c index dd8ad87540ef..26978630378e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4363,7 +4363,8 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, struct io_kiocb *req) { if (io_op_defs[req->opcode].needs_mm && !current->mm) { - if (unlikely(!mmget_not_zero(ctx->sqo_mm))) + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || + !mmget_not_zero(ctx->sqo_mm))) return -EFAULT; kthread_use_mm(ctx->sqo_mm); } -- cgit v1.2.3