From 03d89a2de25bbc5c77e61a0cf77663978c4b6ea7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:20:54 -0600 Subject: io_uring: support for user allocated memory for rings/sqes Currently io_uring applications must call mmap(2) twice to map the rings themselves, and the sqes array. This works fine, but it does not support using huge pages to back the rings/sqes. Provide a way for the application to pass in pre-allocated memory for the rings/sqes, which can then suitably be allocated from shmfs or via mmap to get huge page support. Particularly for larger rings, this reduces the TLBs needed. If an application wishes to take advantage of that, it must pre-allocate the memory needed for the sq/cq ring, and the sqes. The former must be passed in via the io_uring_params->cq_off.user_data field, while the latter is passed in via the io_uring_params->sq_off.user_data field. Then it must set IORING_SETUP_NO_MMAP in the io_uring_params->flags field, and io_uring will then map the existing memory into the kernel for shared use. The application must not call mmap(2) to map rings as it otherwise would have, that will now fail with -EINVAL if this setup flag was used. The pages used for the rings and sqes must be contigious. The intent here is clearly that huge pages should be used, otherwise the normal setup procedure works fine as-is. The application may use one huge page for both the rings and sqes. Outside of those initialization changes, everything works like it did before. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ++++++++++ include/uapi/linux/io_uring.h | 9 +++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1b2a20a42413..f04ce513fadb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,6 +211,16 @@ struct io_ring_ctx { unsigned int compat: 1; enum task_work_notify_mode notify_method; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes. + */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_rings *rings; struct task_struct *submitter_task; struct percpu_ref refs; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0716cb17e436..2edba9a274de 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,11 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -406,7 +411,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +430,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* -- cgit v1.2.3 From 6e76ac595855db27bbdaef337173294a6fd6eb2c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 29 Apr 2023 01:40:30 +0900 Subject: io_uring: Add io_uring_setup flag to pre-register ring fd and never install it With IORING_REGISTER_USE_REGISTERED_RING, an application can register the ring fd and use it via registered index rather than installed fd. This allows using a registered ring for everything *except* the initial mmap. With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the user, rather than requiring a subsequent mmap. The combination of the two allows a user to operate *entirely* via a registered ring fd, making it unnecessary to ever install the fd in the first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make io_uring_setup register the fd and return a registered index, without installing the fd. This allows an application to avoid touching the fd table at all, and allows a library to never even momentarily install a file descriptor. This splits out an io_ring_add_registered_file helper from io_ring_add_registered_fd, for use by io_uring_setup. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 +++++++ io_uring/io_uring.c | 37 ++++++++++++++++++++++--------------- io_uring/io_uring.h | 3 +++ io_uring/tctx.c | 31 ++++++++++++++++++++----------- 4 files changed, 52 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2edba9a274de..f222d263bc55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,6 +178,13 @@ enum { */ #define IORING_SETUP_NO_MMAP (1U << 14) +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61379cf8e7f5..dab09f568294 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; + struct io_uring_task *tctx; struct file *file; int ret; @@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } + if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP)) + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 259bf798a390..9b8dfb3bb2b4 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end); + int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 3a8d1dd97e1b..c043fe93a3f2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -208,31 +208,40 @@ void io_uring_unreg_ringfd(void) } } -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - struct file *file; int offset; - for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } tctx->registered_rings[offset] = file; return offset; } - return -EBUSY; } +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + offset = io_ring_add_registered_file(tctx, file, start, end); + if (offset < 0) + fput(file); + return offset; +} + /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. User passes in an array of struct io_uring_rsrc_update -- cgit v1.2.3 From 5f3139fc46993b2d653a7aa5cdfe66a91881fd06 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 15 May 2023 13:54:42 +0100 Subject: io_uring/cmd: add cmd lazy tw wake helper We want to use IOU_F_TWQ_LAZY_WAKE in commands. First, introduce a new cmd tw helper accepting TWQ flags, and then add io_uring_cmd_do_in_task_laz() that will pass IOU_F_TWQ_LAZY_WAKE and imply the "lazy" semantics, i.e. it posts no more than 1 CQE and delaying execution of this tw should not prevent forward progress. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5b9f6716006df7e817f18bd555aee2f8f9c8b0c3.1684154817.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 18 ++++++++++++++++-- io_uring/uring_cmd.c | 16 ++++++++++++---- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 7fe31b2cd02f..bb9c666bd584 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -46,13 +46,23 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, struct iov_iter *iter, void *ioucmd); void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, unsigned issue_flags); -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} static inline void io_uring_files_cancel(void) { @@ -85,6 +95,10 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned)) { } +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 5e32db48696d..476c7877ce58 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -20,16 +20,24 @@ static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) ioucmd->task_work_cb(ioucmd, issue_flags); } -void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; - io_req_task_work_add(req); + __io_req_task_work_add(req, flags); +} +EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); + +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); } -EXPORT_SYMBOL_GPL(io_uring_cmd_complete_in_task); +EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) -- cgit v1.2.3