From 2840f710f23a3a867426637393acbdfa1f4f1d59 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:51 +0100 Subject: io_uring: fix drain with rsrc CQEs Resource emitted CQEs are not bound to requests, so fix up counters used for DRAIN/defer logic. Fixes: b60c8dce33895 ("io_uring: preparation for rsrc tagging") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2b32f5f0a40d5928c3466d028f936e167f0654be.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 360f81395d81..b0706f047c50 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7539,6 +7539,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_ring_submit_lock(ctx, lock_ring); spin_lock_irqsave(&ctx->completion_lock, flags); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); + ctx->cq_extra++; io_commit_cqring(ctx); spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); -- cgit v1.2.3 From dddca22636c9062f284e755e2a49fb8863db8a82 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:52 +0100 Subject: io_uring: dont overlap internal and user req flags CQE flags take one byte that we store in req->flags together with other REQ_F_* internal flags. CQE flags are copied directly into req and then verified that requires some handling on failures, e.g. to make sure that that copy doesn't set some of the internal flags. Move all internal flags to take bits after the first byte, so we don't need extra handling and make it safer overall. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/b8b5b02d1ab9d786fcc7db4a3fe86db6b70b8987.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b0706f047c50..2872f1db6458 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -702,7 +702,8 @@ enum { REQ_F_FORCE_ASYNC_BIT = IOSQE_ASYNC_BIT, REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT, - REQ_F_FAIL_LINK_BIT, + /* first byte is taken by user flags, shift it to not overlap */ + REQ_F_FAIL_LINK_BIT = 8, REQ_F_INFLIGHT_BIT, REQ_F_CUR_POS_BIT, REQ_F_NOWAIT_BIT, @@ -6503,14 +6504,10 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->work.creds = NULL; /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { - req->flags = 0; + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) return -EINVAL; - } - if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) return -EACCES; -- cgit v1.2.3 From b0d658ec88a695861c3fd78ef783c1181f81a6e2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 27 Apr 2021 16:13:53 +0100 Subject: io_uring: add more build check for uapi Add a couple of BUILD_BUG_ON() checking some rsrc uapi structs and SQE flags. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ff960df4d5026b9fb5bfd80994b9d3667d3926da.1619536280.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2872f1db6458..aa5854cd3942 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -10134,6 +10134,13 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_ON(sizeof(struct io_uring_files_update) != + sizeof(struct io_uring_rsrc_update)); + BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > + sizeof(struct io_uring_rsrc_update2)); + /* should fit into one byte */ + BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); + BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | -- cgit v1.2.3 From 6224843d56e0c29c0357e86b02b95801897c2caf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 28 Apr 2021 13:11:29 +0100 Subject: io_uring: allow empty slots for reg buffers Allow empty reg buffer slots any request using which should fail. This allows users to not register all buffers in advance, but do it lazily and/or on demand via updates. That is achieved by setting iov_base and iov_len to zero for registration and/or buffer updates. Empty buffer can't have a non-zero tag. Implementation details: to not add extra overhead to io_import_fixed(), create a dummy buffer crafted to fail any request using it, and set it to all empty buffer slots. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7e95e4d700082baaf010c648c72ac764c9cc8826.1619611868.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 36 +++++++++++++++++++++++++++++------- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index aa5854cd3942..47c2f126f885 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -456,6 +456,7 @@ struct io_ring_ctx { spinlock_t rsrc_ref_lock; struct io_rsrc_node *rsrc_node; struct io_rsrc_node *rsrc_backup_node; + struct io_mapped_ubuf *dummy_ubuf; struct io_restriction restrictions; @@ -1158,6 +1159,12 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) goto err; __hash_init(ctx->cancel_hash, 1U << hash_bits); + ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL); + if (!ctx->dummy_ubuf) + goto err; + /* set invalid range, so io_import_fixed() fails meeting it */ + ctx->dummy_ubuf->ubuf = -1UL; + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -1185,6 +1192,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list); return ctx; err: + kfree(ctx->dummy_ubuf); kfree(ctx->cancel_hash); kfree(ctx); return NULL; @@ -8109,11 +8117,13 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo struct io_mapped_ubuf *imu = *slot; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); + if (imu != ctx->dummy_ubuf) { + for (i = 0; i < imu->nr_bvecs; i++) + unpin_user_page(imu->bvec[i].bv_page); + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + kvfree(imu); + } *slot = NULL; } @@ -8253,6 +8263,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size_t size; int ret, pret, nr_pages, i; + if (!iov->iov_base) { + *pimu = ctx->dummy_ubuf; + return 0; + } + ubuf = (unsigned long) iov->iov_base; end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ubuf >> PAGE_SHIFT; @@ -8350,7 +8365,9 @@ static int io_buffer_validate(struct iovec *iov) * constraints here, we'll -EINVAL later when IO is * submitted if they are wrong. */ - if (!iov->iov_base || !iov->iov_len) + if (!iov->iov_base) + return iov->iov_len ? -EFAULT : 0; + if (!iov->iov_len) return -EFAULT; /* arbitrary limit, but we need something */ @@ -8400,6 +8417,8 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(&iov); if (ret) break; + if (!iov.iov_base && tag) + return -EINVAL; ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); @@ -8449,12 +8468,14 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(&iov); if (err) break; + if (!iov.iov_base && tag) + return -EINVAL; err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; i = array_index_nospec(offset, ctx->nr_user_bufs); - if (ctx->user_bufs[i]) { + if (ctx->user_bufs[i] != ctx->dummy_ubuf) { err = io_queue_rsrc_removal(ctx->buf_data, offset, ctx->rsrc_node, ctx->user_bufs[i]); if (unlikely(err)) { @@ -8602,6 +8623,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); + kfree(ctx->dummy_ubuf); kfree(ctx); } -- cgit v1.2.3 From 47b228ce6f66830768eac145efa7746637969101 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 29 Apr 2021 11:46:48 +0100 Subject: io_uring: fix unchecked error in switch_start() io_rsrc_node_switch_start() can fail, don't forget to check returned error code. Reported-by: syzbot+a4715dd4b7c866136f79@syzkaller.appspotmail.com Fixes: eae071c9b4cef ("io_uring: prepare fixed rw for dynanic buffers") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/c4c06e2f3f0c8e43bd8d0a266c79055bcc6b6e60.1619693112.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 47c2f126f885..82117e6bf45d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9627,7 +9627,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; /* always set a rsrc node */ - io_rsrc_node_switch_start(ctx); + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; io_rsrc_node_switch(ctx, NULL); memset(&p->sq_off, 0, sizeof(p->sq_off)); -- cgit v1.2.3 From cf3770e78421f268dee3c1eef5e8a5d284ec3416 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Apr 2021 11:46:02 +0100 Subject: io_uring: Fix premature return from loop and memory leak Currently the -EINVAL error return path is leaking memory allocated to data. Fix this by not returning immediately but instead setting the error return variable to -EINVAL and breaking out of the loop. Kudos to Pavel Begunkov for suggesting a correct fix. Signed-off-by: Colin Ian King Reviewed-by: Pavel Begunkov Link: https://lore.kernel.org/r/20210429104602.62676-1-colin.king@canonical.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 82117e6bf45d..a880edb90d0c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8417,8 +8417,10 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, ret = io_buffer_validate(&iov); if (ret) break; - if (!iov.iov_base && tag) - return -EINVAL; + if (!iov.iov_base && tag) { + ret = -EINVAL; + break; + } ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i], &last_hpage); @@ -8468,8 +8470,10 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(&iov); if (err) break; - if (!iov.iov_base && tag) - return -EINVAL; + if (!iov.iov_base && tag) { + err = -EINVAL; + break; + } err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage); if (err) break; -- cgit v1.2.3 From bb6659cc0ad3c2afc3801b708b19c4c67e55ddf2 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 30 Apr 2021 16:25:15 +0800 Subject: io_uring: Fix memory leak in io_sqe_buffers_register() unreferenced object 0xffff8881123bf0a0 (size 32): comm "syz-executor557", pid 8384, jiffies 4294946143 (age 12.360s) backtrace: [] kmalloc_node include/linux/slab.h:579 [inline] [] kvmalloc_node+0x61/0xf0 mm/util.c:587 [] kvmalloc include/linux/mm.h:795 [inline] [] kvmalloc_array include/linux/mm.h:813 [inline] [] kvcalloc include/linux/mm.h:818 [inline] [] io_rsrc_data_alloc+0x4f/0xc0 fs/io_uring.c:7164 [] io_sqe_buffers_register+0x98/0x3d0 fs/io_uring.c:8383 [] __io_uring_register+0xf67/0x18c0 fs/io_uring.c:9986 [] __do_sys_io_uring_register fs/io_uring.c:10091 [inline] [] __se_sys_io_uring_register fs/io_uring.c:10071 [inline] [] __x64_sys_io_uring_register+0x112/0x230 fs/io_uring.c:10071 [] do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 [] entry_SYSCALL_64_after_hwframe+0x44/0xae Fix data->tags memory leak, through io_rsrc_data_free() to release data memory space. Reported-by: syzbot+0f32d05d8b6cd8d7ea3e@syzkaller.appspotmail.com Signed-off-by: Zqiang Link: https://lore.kernel.org/r/20210430082515.13886-1-qiang.zhang@windriver.com Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index a880edb90d0c..7a2e83bc005d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8140,7 +8140,7 @@ static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_user_bufs; i++) io_buffer_unmap(ctx, &ctx->user_bufs[i]); kfree(ctx->user_bufs); - kfree(ctx->buf_data); + io_rsrc_data_free(ctx->buf_data); ctx->user_bufs = NULL; ctx->buf_data = NULL; ctx->nr_user_bufs = 0; @@ -8400,7 +8400,7 @@ static int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return -ENOMEM; ret = io_buffers_map_alloc(ctx, nr_args); if (ret) { - kfree(data); + io_rsrc_data_free(data); return ret; } -- cgit v1.2.3 From d1f82808877bb10d3deee7cf3374a4eb3fb582db Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Wed, 5 May 2021 09:47:06 -0300 Subject: io_uring: truncate lengths larger than MAX_RW_COUNT on provide buffers Read and write operations are capped to MAX_RW_COUNT. Some read ops rely on that limit, and that is not guaranteed by the IORING_OP_PROVIDE_BUFFERS. Truncate those lengths when doing io_add_buffers, so buffer addresses still use the uncapped length. Also, take the chance and change struct io_buffer len member to __u32, so it matches struct io_provide_buffer len member. This fixes CVE-2021-3491, also reported as ZDI-CAN-13546. Fixes: ddf0322db79c ("io_uring: add IORING_OP_PROVIDE_BUFFERS") Reported-by: Billy Jheng Bing-Jhong (@st424204) Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jens Axboe --- fs/io_uring.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7a2e83bc005d..f46acbbeed57 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -251,7 +251,7 @@ struct io_rsrc_data { struct io_buffer { struct list_head list; __u64 addr; - __s32 len; + __u32 len; __u16 bid; }; @@ -3986,7 +3986,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) break; buf->addr = addr; - buf->len = pbuf->len; + buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; addr += pbuf->len; bid++; -- cgit v1.2.3 From a5e7da1494e191c561ecce8829a6c19449585e3d Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 5 May 2021 07:37:28 +0200 Subject: MAINTAINERS: add io_uring tool to IO_URING The files in ./tools/io_uring/ are maintained by the IO_URING maintainers. Reflect that fact in MAINTAINERS. Signed-off-by: Lukas Bulwahn Link: https://lore.kernel.org/r/20210505053728.3868-1-lukas.bulwahn@gmail.com Signed-off-by: Jens Axboe --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5c90148f0369..b680192d18b9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9497,6 +9497,7 @@ F: fs/io-wq.h F: fs/io_uring.c F: include/linux/io_uring.h F: include/uapi/linux/io_uring.h +F: tools/io_uring/ IPMI SUBSYSTEM M: Corey Minyard -- cgit v1.2.3 From 50b7b6f29de3e18e9d6c09641256a0296361cfee Mon Sep 17 00:00:00 2001 From: Stefan Metzmacher Date: Wed, 5 May 2021 13:03:10 +0200 Subject: x86/process: setup io_threads more like normal user space threads As io_threads are fully set up USER threads it's clearer to separate the code path from the KTHREAD logic. The only remaining difference to user space threads is that io_threads never return to user space again. Instead they loop within the given worker function. The fact that they never return to user space means they don't have an user space thread stack. In order to indicate that to tools like gdb we reset the stack and instruction pointers to 0. This allows gdb attach to user space processes using io-uring, which like means that they have io_threads, without printing worrying message like this: warning: Selected architecture i386:x86-64 is not compatible with reported target architecture i386 warning: Architecture rejected target-supplied description The output will be something like this: (gdb) info threads Id Target Id Frame * 1 LWP 4863 "io_uring-cp-for" syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38 2 LWP 4864 "iou-mgr-4863" 0x0000000000000000 in ?? () 3 LWP 4865 "iou-wrk-4863" 0x0000000000000000 in ?? () (gdb) thread 3 [Switching to thread 3 (LWP 4865)] #0 0x0000000000000000 in ?? () (gdb) bt #0 0x0000000000000000 in ?? () Backtrace stopped: Cannot access memory at address 0x0 Fixes: 4727dc20e042 ("arch: setup PF_IO_WORKER threads like PF_KTHREAD") Link: https://lore.kernel.org/io-uring/044d0bad-6888-a211-e1d3-159a4aeed52d@polymtl.ca/T/#m1bbf5727e3d4e839603f6ec7ed79c7eebfba6267 Signed-off-by: Stefan Metzmacher cc: Linus Torvalds cc: Jens Axboe cc: Andy Lutomirski cc: linux-kernel@vger.kernel.org cc: io-uring@vger.kernel.org cc: x86@kernel.org Link: https://lore.kernel.org/r/20210505110310.237537-1-metze@samba.org Reviewed-by: Thomas Gleixner Signed-off-by: Jens Axboe --- arch/x86/kernel/process.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 43cbfc84153a..5e1f38179f49 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, #endif /* Kernel thread ? */ - if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { + if (unlikely(p->flags & PF_KTHREAD)) { memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; @@ -172,6 +172,23 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, task_user_gs(p) = get_user_gs(current_pt_regs()); #endif + if (unlikely(p->flags & PF_IO_WORKER)) { + /* + * An IO thread is a user space thread, but it doesn't + * return to ret_after_fork(). + * + * In order to indicate that to tools like gdb, + * we reset the stack and instruction pointers. + * + * It does the same kernel frame setup to return to a kernel + * function that a kernel thread does. + */ + childregs->sp = 0; + childregs->ip = 0; + kthread_frame_init(frame, sp, arg); + return 0; + } + /* Set a new TLS for the child thread? */ if (clone_flags & CLONE_SETTLS) ret = set_new_tls(p, tls); -- cgit v1.2.3