From b66509b8497f2b002a2654e386a440f1274ddcc7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:35 +0000 Subject: io_uring: split out cmd api into a separate header linux/io_uring.h is slowly becoming a rubbish bin where we put anything exposed to other subsystems. For instance, the task exit hooks and io_uring cmd infra are completely orthogonal and don't need each other's definitions. Start cleaning it up by splitting out all command bits into a new header file. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7ec50bae6e21f371d3850796e716917fc141225a.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 89 +----------------------------------------- include/linux/io_uring/cmd.h | 81 ++++++++++++++++++++++++++++++++++++++ include/linux/io_uring_types.h | 20 ++++++++++ 3 files changed, 102 insertions(+), 88 deletions(-) create mode 100644 include/linux/io_uring/cmd.h (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index aefb73eeeebf..d8fc93492dc5 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -6,71 +6,13 @@ #include #include -enum io_uring_cmd_flags { - IO_URING_F_COMPLETE_DEFER = 1, - IO_URING_F_UNLOCKED = 2, - /* the request is executed from poll, it should not be freed */ - IO_URING_F_MULTISHOT = 4, - /* executed by io-wq */ - IO_URING_F_IOWQ = 8, - /* int's last bit, sign checks are usually faster than a bit test */ - IO_URING_F_NONBLOCK = INT_MIN, - - /* ctx state flags, for URING_CMD */ - IO_URING_F_SQE128 = (1 << 8), - IO_URING_F_CQE32 = (1 << 9), - IO_URING_F_IOPOLL = (1 << 10), - - /* set when uring wants to cancel a previously issued command */ - IO_URING_F_CANCEL = (1 << 11), - IO_URING_F_COMPAT = (1 << 12), -}; - -/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ -#define IORING_URING_CMD_CANCELABLE (1U << 30) -#define IORING_URING_CMD_POLLED (1U << 31) - -struct io_uring_cmd { - struct file *file; - const struct io_uring_sqe *sqe; - union { - /* callback to defer completions to task context */ - void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); - /* used for polled completion */ - void *cookie; - }; - u32 cmd_op; - u32 flags; - u8 pdu[32]; /* available inline for free use */ -}; - -static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) -{ - return sqe->cmd; -} - #if defined(CONFIG_IO_URING) -int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); -void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, - unsigned issue_flags); struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); -void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned), - unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); static inline void io_uring_files_cancel(void) { @@ -89,28 +31,7 @@ static inline void io_uring_free(struct task_struct *tsk) if (tsk->io_uring) __io_uring_free(tsk); } -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); -void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) -{ - return -EOPNOTSUPP; -} -static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, - ssize_t ret2, unsigned issue_flags) -{ -} -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} static inline struct sock *io_uring_get_socket(struct file *file) { return NULL; @@ -133,14 +54,6 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } -static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ -} -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return NULL; -} #endif #endif diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h new file mode 100644 index 000000000000..62fcfaf6fcc9 --- /dev/null +++ b/include/linux/io_uring/cmd.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_IO_URING_CMD_H +#define _LINUX_IO_URING_CMD_H + +#include +#include + +/* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ +#define IORING_URING_CMD_CANCELABLE (1U << 30) +#define IORING_URING_CMD_POLLED (1U << 31) + +struct io_uring_cmd { + struct file *file; + const struct io_uring_sqe *sqe; + union { + /* callback to defer completions to task context */ + void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned); + /* used for polled completion */ + void *cookie; + }; + u32 cmd_op; + u32 flags; + u8 pdu[32]; /* available inline for free use */ +}; + +static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) +{ + return sqe->cmd; +} + +#if defined(CONFIG_IO_URING) +int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd); +void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, + unsigned issue_flags); +void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags); +/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ +void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)); + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + +void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags); +struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); + +#else +static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, void *ioucmd) +{ + return -EOPNOTSUPP; +} +static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, + ssize_t ret2, unsigned issue_flags) +{ +} +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ +} +static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ +} +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return NULL; +} +#endif + +#endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 805bb635cdf5..8c807bcc8b2b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,26 @@ #include #include +enum io_uring_cmd_flags { + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* the request is executed from poll, it should not be freed */ + IO_URING_F_MULTISHOT = 4, + /* executed by io-wq */ + IO_URING_F_IOWQ = 8, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, + + /* ctx state flags, for URING_CMD */ + IO_URING_F_SQE128 = (1 << 8), + IO_URING_F_CQE32 = (1 << 9), + IO_URING_F_IOPOLL = (1 << 10), + + /* set when uring wants to cancel a previously issued command */ + IO_URING_F_CANCEL = (1 << 11), + IO_URING_F_COMPAT = (1 << 12), +}; + struct io_wq_work_node { struct io_wq_work_node *next; }; -- cgit v1.2.3 From 6b04a3737057ddfed396c954f9e4be4fe6d53c62 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:36 +0000 Subject: io_uring/cmd: inline io_uring_cmd_do_in_task_lazy Now as we can easily include io_uring_types.h, move IOU_F_TWQ_LAZY_WAKE and inline io_uring_cmd_do_in_task_lazy(). Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/2ec9fb31dd192d1c5cf26d0a2dec5657d88a8e48.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 31 ++++++++++++++++--------------- include/linux/io_uring_types.h | 11 +++++++++++ io_uring/io_uring.h | 10 ---------- io_uring/uring_cmd.c | 7 ------- 4 files changed, 27 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 62fcfaf6fcc9..ee9b3bc3a4af 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -36,15 +36,6 @@ void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2, void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void (*task_work_cb)(struct io_uring_cmd *, unsigned), unsigned flags); -/* users should follow semantics of IOU_F_TWQ_LAZY_WAKE */ -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)); - -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); -} void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); @@ -60,12 +51,9 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t ret2, unsigned issue_flags) { } -static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ -} -static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned), + unsigned flags) { } static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, @@ -78,4 +66,17 @@ static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd } #endif +/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ +static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); +} + +static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, + void (*task_work_cb)(struct io_uring_cmd *, unsigned)) +{ + __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); +} + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 8c807bcc8b2b..bebab36abce8 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -7,6 +7,17 @@ #include #include +enum { + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. + */ + IOU_F_TWQ_LAZY_WAKE = 1, +}; + enum io_uring_cmd_flags { IO_URING_F_COMPLETE_DEFER = 1, IO_URING_F_UNLOCKED = 2, diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ed84f2737b3a..ef783a2444ac 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -15,16 +15,6 @@ #include #endif -enum { - /* - * A hint to not wake right away but delay until there are enough of - * tw's queued to match the number of CQEs the task is waiting for. - * - * Must not be used wirh requests generating more than one CQE. - * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. - */ - IOU_F_TWQ_LAZY_WAKE = 1, -}; enum { IOU_OK = 0, diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 4ed0c66e3aae..94e9de1c24aa 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -78,13 +78,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, } EXPORT_SYMBOL_GPL(__io_uring_cmd_do_in_task); -void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd, - void (*task_work_cb)(struct io_uring_cmd *, unsigned)) -{ - __io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE); -} -EXPORT_SYMBOL_GPL(io_uring_cmd_do_in_task_lazy); - static inline void io_req_set_cqe32_extra(struct io_kiocb *req, u64 extra1, u64 extra2) { -- cgit v1.2.3 From 055c15626a45b1ebc9f2f34981e705e1af171236 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 1 Dec 2023 00:57:37 +0000 Subject: io_uring/cmd: inline io_uring_cmd_get_task With io_uring_types.h we see all required definitions to inline io_uring_cmd_get_task(). Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/aa8e317f09e651a5f3e72f8c0ad3902084c1f930.1701391955.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 10 +++++----- io_uring/uring_cmd.c | 6 ------ 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index ee9b3bc3a4af..d69b4038aa3e 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -39,7 +39,6 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -60,10 +59,6 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, unsigned int issue_flags) { } -static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return NULL; -} #endif /* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */ @@ -79,4 +74,9 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd, __io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0); } +static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) +{ + return cmd_to_io_kiocb(cmd)->task; +} + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 94e9de1c24aa..34030583b9b2 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -52,12 +52,6 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd) -{ - return cmd_to_io_kiocb(cmd)->task; -} -EXPORT_SYMBOL_GPL(io_uring_cmd_get_task); - static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); -- cgit v1.2.3 From dc18b89ab113e9c6c7a529316ddf7029fb55132d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 7 Dec 2023 20:06:02 -0700 Subject: io_uring/openclose: add support for IORING_OP_FIXED_FD_INSTALL io_uring can currently open/close regular files or fixed/direct descriptors. Or you can instantiate a fixed descriptor from a regular one, and then close the regular descriptor. But you currently can't turn a purely fixed/direct descriptor into a regular file descriptor. IORING_OP_FIXED_FD_INSTALL adds support for installing a direct descriptor into the normal file table, just like receiving a file descriptor or opening a new file would do. This is all nicely abstracted into receive_fd(), and hence adding support for this is truly trivial. Since direct descriptors are only usable within io_uring itself, it can be useful to turn them into real file descriptors if they ever need to be accessed via normal syscalls. This can either be a transitory thing, or just a permanent transition for a given direct descriptor. By default, new fds are installed with O_CLOEXEC set. The application can disable O_CLOEXEC by setting IORING_FIXED_FD_NO_CLOEXEC in the sqe->install_fd_flags member. Suggested-by: Christian Brauner Reviewed-by: Christian Brauner Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 9 +++++++++ io_uring/opdef.c | 9 +++++++++ io_uring/openclose.c | 44 +++++++++++++++++++++++++++++++++++++++++++ io_uring/openclose.h | 3 +++ 4 files changed, 65 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index f1c16f817742..db4b913e6b39 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -71,6 +71,7 @@ struct io_uring_sqe { __u32 uring_cmd_flags; __u32 waitid_flags; __u32 futex_flags; + __u32 install_fd_flags; }; __u64 user_data; /* data to be passed back at completion time */ /* pack this to avoid bogus arm OABI complaints */ @@ -253,6 +254,7 @@ enum io_uring_op { IORING_OP_FUTEX_WAIT, IORING_OP_FUTEX_WAKE, IORING_OP_FUTEX_WAITV, + IORING_OP_FIXED_FD_INSTALL, /* this goes last, obviously */ IORING_OP_LAST, @@ -386,6 +388,13 @@ enum { /* Pass through the flags from sqe->file_index to cqe->flags */ #define IORING_MSG_RING_FLAGS_PASS (1U << 1) +/* + * IORING_OP_FIXED_FD_INSTALL flags (sqe->install_fd_flags) + * + * IORING_FIXED_FD_NO_CLOEXEC Don't mark the fd as O_CLOEXEC + */ +#define IORING_FIXED_FD_NO_CLOEXEC (1U << 0) + /* * IO completion data structure (Completion Queue Entry) */ diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 799db44283c7..6705634e5f52 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -469,6 +469,12 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_FIXED_FD_INSTALL] = { + .needs_file = 1, + .audit_skip = 1, + .prep = io_install_fixed_fd_prep, + .issue = io_install_fixed_fd, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -704,6 +710,9 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_FUTEX_WAITV] = { .name = "FUTEX_WAITV", }, + [IORING_OP_FIXED_FD_INSTALL] = { + .name = "FIXED_FD_INSTALL", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/openclose.c b/io_uring/openclose.c index 74fc22461f48..0fe0dd305546 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -31,6 +31,11 @@ struct io_close { u32 file_slot; }; +struct io_fixed_install { + struct file *file; + unsigned int o_flags; +}; + static bool io_openat_force_async(struct io_open *open) { /* @@ -254,3 +259,42 @@ err: io_req_set_res(req, ret, 0); return IOU_OK; } + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_fixed_install *ifi; + unsigned int flags; + + if (sqe->off || sqe->addr || sqe->len || sqe->buf_index || + sqe->splice_fd_in || sqe->addr3) + return -EINVAL; + + /* must be a fixed file */ + if (!(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + flags = READ_ONCE(sqe->install_fd_flags); + if (flags & ~IORING_FIXED_FD_NO_CLOEXEC) + return -EINVAL; + + /* default to O_CLOEXEC, disable if IORING_FIXED_FD_NO_CLOEXEC is set */ + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + ifi->o_flags = O_CLOEXEC; + if (flags & IORING_FIXED_FD_NO_CLOEXEC) + ifi->o_flags = 0; + + return 0; +} + +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_fixed_install *ifi; + int ret; + + ifi = io_kiocb_to_cmd(req, struct io_fixed_install); + ret = receive_fd(req->file, NULL, ifi->o_flags); + if (ret < 0) + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_OK; +} diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 4b1c28d3a66c..8a93c98ad0ad 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -12,3 +12,6 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); + +int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); -- cgit v1.2.3 From a4104821ad651d8a0b374f0b2474c345bbb42f82 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Dec 2023 12:30:43 -0700 Subject: io_uring/unix: drop usage of io_uring socket Since we no longer allow sending io_uring fds over SCM_RIGHTS, move to using io_is_uring_fops() to detect whether this is a io_uring fd or not. With that done, kill off io_uring_get_socket() as nobody calls it anymore. This is in preparation to yanking out the rest of the core related to unix gc with io_uring. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 10 +++++----- io_uring/io_uring.c | 13 ------------- io_uring/io_uring.h | 1 - net/core/scm.c | 2 +- net/unix/scm.c | 4 +--- 5 files changed, 7 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index d8fc93492dc5..68ed6697fece 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -7,12 +7,12 @@ #include #if defined(CONFIG_IO_URING) -struct sock *io_uring_get_socket(struct file *file); void __io_uring_cancel(bool cancel_all); void __io_uring_free(struct task_struct *tsk); void io_uring_unreg_ringfd(void); const char *io_uring_get_opcode(u8 opcode); int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags); +bool io_is_uring_fops(struct file *file); static inline void io_uring_files_cancel(void) { @@ -32,10 +32,6 @@ static inline void io_uring_free(struct task_struct *tsk) __io_uring_free(tsk); } #else -static inline struct sock *io_uring_get_socket(struct file *file) -{ - return NULL; -} static inline void io_uring_task_cancel(void) { } @@ -54,6 +50,10 @@ static inline int io_uring_cmd_sock(struct io_uring_cmd *cmd, { return -EOPNOTSUPP; } +static inline bool io_is_uring_fops(struct file *file) +{ + return false; +} #endif #endif diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2b24ce692b0b..a9acccd45880 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -171,19 +171,6 @@ static struct ctl_table kernel_io_uring_disabled_table[] = { }; #endif -struct sock *io_uring_get_socket(struct file *file) -{ -#if defined(CONFIG_UNIX) - if (io_is_uring_fops(file)) { - struct io_ring_ctx *ctx = file->private_data; - - return ctx->ring_sock->sk; - } -#endif - return NULL; -} -EXPORT_SYMBOL(io_uring_get_socket); - static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) { if (!wq_list_empty(&ctx->submit_state.compl_reqs) || diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 1112c198e516..04e33f25919c 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -44,7 +44,6 @@ struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); diff --git a/net/core/scm.c b/net/core/scm.c index db3f7cd519c2..d0e0852a24d5 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -105,7 +105,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) if (fd < 0 || !(file = fget_raw(fd))) return -EBADF; /* don't allow io_uring files */ - if (io_uring_get_socket(file)) { + if (io_is_uring_fops(file)) { fput(file); return -EINVAL; } diff --git a/net/unix/scm.c b/net/unix/scm.c index 6ff628f2349f..822ce0d0d791 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -35,10 +35,8 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && ops && ops->family == PF_UNIX) u_sock = s; - } else { - /* Could be an io_uring instance */ - u_sock = io_uring_get_socket(filp); } + return u_sock; } EXPORT_SYMBOL(unix_get_socket); -- cgit v1.2.3 From 6e5e6d274956305f1fc0340522b38f5f5be74bdb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 Dec 2023 12:36:34 -0700 Subject: io_uring: drop any code related to SCM_RIGHTS This is dead code after we dropped support for passing io_uring fds over SCM_RIGHTS, get rid of it. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 - io_uring/filetable.c | 11 +-- io_uring/io_uring.c | 32 +------- io_uring/rsrc.c | 169 +---------------------------------------- io_uring/rsrc.h | 15 ---- 5 files changed, 10 insertions(+), 220 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index bebab36abce8..fc8f2570b92b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -389,9 +389,6 @@ struct io_ring_ctx { struct wait_queue_head rsrc_quiesce_wq; unsigned rsrc_quiesce; - #if defined(CONFIG_UNIX) - struct socket *ring_sock; - #endif /* hashed buffered write serialization */ struct io_wq_hash *hash_map; diff --git a/io_uring/filetable.c b/io_uring/filetable.c index e7d749991de4..6e86e6188dbe 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -87,13 +87,10 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, io_file_bitmap_clear(&ctx->file_table, slot_index); } - ret = io_scm_file_account(ctx, file); - if (!ret) { - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); - io_file_bitmap_set(&ctx->file_table, slot_index); - } - return ret; + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + io_file_bitmap_set(&ctx->file_table, slot_index); + return 0; } int __io_fixed_fd_install(struct io_ring_ctx *ctx, struct file *file, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a9acccd45880..a9a519fa9926 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -60,7 +60,6 @@ #include #include #include -#include #include #include #include @@ -2866,13 +2865,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx, ctx->rsrc_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - ctx->ring_sock->file = NULL; /* so that iput() is called */ - sock_release(ctx->ring_sock); - } -#endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); @@ -3781,32 +3773,12 @@ static int io_uring_install_fd(struct file *file) /* * Allocate an anonymous fd, this is what constitutes the application * visible backing of an io_uring instance. The application mmaps this - * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, - * we have to tie this fd to a socket for file garbage collection purposes. + * fd to gain access to the SQ/CQ ring details. */ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) { - struct file *file; -#if defined(CONFIG_UNIX) - int ret; - - ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, - &ctx->ring_sock); - if (ret) - return ERR_PTR(ret); -#endif - - file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + return anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, O_RDWR | O_CLOEXEC, NULL); -#if defined(CONFIG_UNIX) - if (IS_ERR(file)) { - sock_release(ctx->ring_sock); - ctx->ring_sock = NULL; - } else { - ctx->ring_sock->file = file; - } -#endif - return file; } static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f521c5965a93..4818b79231dd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -24,7 +24,6 @@ struct io_rsrc_update { }; static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); @@ -157,7 +156,7 @@ static void io_rsrc_put_work(struct io_rsrc_node *node) switch (node->type) { case IORING_RSRC_FILE: - io_rsrc_file_put(node->ctx, prsrc); + fput(prsrc->file); break; case IORING_RSRC_BUFFER: io_rsrc_buf_put(node->ctx, prsrc); @@ -402,23 +401,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, break; } /* - * Don't allow io_uring instances to be registered. If - * UNIX isn't enabled, then this causes a reference - * cycle and this instance can never get freed. If UNIX - * is enabled we'll handle it just fine, but there's - * still no point in allowing a ring fd as it doesn't - * support regular read/write anyway. + * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); err = -EBADF; break; } - err = io_scm_file_account(ctx, file); - if (err) { - fput(file); - break; - } *io_get_tag_slot(data, i) = tag; io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); @@ -675,22 +664,12 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx) for (i = 0; i < ctx->nr_user_files; i++) { struct file *file = io_file_from_index(&ctx->file_table, i); - /* skip scm accounted files, they'll be freed by ->ring_sock */ - if (!file || io_file_need_scm(file)) + if (!file) continue; io_file_bitmap_clear(&ctx->file_table, i); fput(file); } -#if defined(CONFIG_UNIX) - if (ctx->ring_sock) { - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff *skb; - - while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) - kfree_skb(skb); - } -#endif io_free_file_tables(&ctx->file_table); io_file_table_set_alloc_range(ctx, 0, 0); io_rsrc_data_free(ctx->file_data); @@ -718,137 +697,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx) return ret; } -/* - * Ensure the UNIX gc is aware of our file set, so we are certain that - * the io_uring can be safely unregistered on process exit, even if we have - * loops in the file referencing. We account only files that can hold other - * files because otherwise they can't form a loop and so are not interesting - * for GC. - */ -int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sk = ctx->ring_sock->sk; - struct sk_buff_head *head = &sk->sk_receive_queue; - struct scm_fp_list *fpl; - struct sk_buff *skb; - - if (likely(!io_file_need_scm(file))) - return 0; - - /* - * See if we can merge this file into an existing skb SCM_RIGHTS - * file set. If there's no room, fall back to allocating a new skb - * and filling it in. - */ - spin_lock_irq(&head->lock); - skb = skb_peek(head); - if (skb && UNIXCB(skb).fp->count < SCM_MAX_FD) - __skb_unlink(skb, head); - else - skb = NULL; - spin_unlock_irq(&head->lock); - - if (!skb) { - fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); - if (!fpl) - return -ENOMEM; - - skb = alloc_skb(0, GFP_KERNEL); - if (!skb) { - kfree(fpl); - return -ENOMEM; - } - - fpl->user = get_uid(current_user()); - fpl->max = SCM_MAX_FD; - fpl->count = 0; - - UNIXCB(skb).fp = fpl; - skb->sk = sk; - skb->destructor = io_uring_destruct_scm; - refcount_add(skb->truesize, &sk->sk_wmem_alloc); - } - - fpl = UNIXCB(skb).fp; - fpl->fp[fpl->count++] = get_file(file); - unix_inflight(fpl->user, file); - skb_queue_head(head, skb); - fput(file); -#endif - return 0; -} - -static __cold void io_rsrc_file_scm_put(struct io_ring_ctx *ctx, struct file *file) -{ -#if defined(CONFIG_UNIX) - struct sock *sock = ctx->ring_sock->sk; - struct sk_buff_head list, *head = &sock->sk_receive_queue; - struct sk_buff *skb; - int i; - - __skb_queue_head_init(&list); - - /* - * Find the skb that holds this file in its SCM_RIGHTS. When found, - * remove this entry and rearrange the file array. - */ - skb = skb_dequeue(head); - while (skb) { - struct scm_fp_list *fp; - - fp = UNIXCB(skb).fp; - for (i = 0; i < fp->count; i++) { - int left; - - if (fp->fp[i] != file) - continue; - - unix_notinflight(fp->user, fp->fp[i]); - left = fp->count - 1 - i; - if (left) { - memmove(&fp->fp[i], &fp->fp[i + 1], - left * sizeof(struct file *)); - } - fp->count--; - if (!fp->count) { - kfree_skb(skb); - skb = NULL; - } else { - __skb_queue_tail(&list, skb); - } - fput(file); - file = NULL; - break; - } - - if (!file) - break; - - __skb_queue_tail(&list, skb); - - skb = skb_dequeue(head); - } - - if (skb_peek(&list)) { - spin_lock_irq(&head->lock); - while ((skb = __skb_dequeue(&list)) != NULL) - __skb_queue_tail(head, skb); - spin_unlock_irq(&head->lock); - } -#endif -} - -static void io_rsrc_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - struct file *file = prsrc->file; - - if (likely(!io_file_need_scm(file))) - fput(file); - else - io_rsrc_file_scm_put(ctx, file); -} - int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -897,21 +745,12 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; /* - * Don't allow io_uring instances to be registered. If UNIX - * isn't enabled, then this causes a reference cycle and this - * instance can never get freed. If UNIX is enabled we'll - * handle it just fine, but there's still no point in allowing - * a ring fd as it doesn't support regular read/write anyway. + * Don't allow io_uring instances to be registered. */ if (io_is_uring_fops(file)) { fput(file); goto fail; } - ret = io_scm_file_account(ctx, file); - if (ret) { - fput(file); - goto fail; - } file_slot = io_fixed_file_slot(&ctx->file_table, i); io_fixed_file_set(file_slot, file); io_file_bitmap_set(&ctx->file_table, i); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 08ac0d8e07ef..7238b9cfe33b 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -75,21 +75,6 @@ int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); -int __io_scm_file_account(struct io_ring_ctx *ctx, struct file *file); - -static inline bool io_file_need_scm(struct file *filp) -{ - return false; -} - -static inline int io_scm_file_account(struct io_ring_ctx *ctx, - struct file *file) -{ - if (likely(!io_file_need_scm(file))) - return 0; - return __io_scm_file_account(ctx, file); -} - int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args); int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, -- cgit v1.2.3 From d293b1a89694fc4918d9a4330a71ba2458f9d581 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 21 Dec 2023 09:02:57 -0700 Subject: io_uring/kbuf: add method for returning provided buffer ring head The tail of the provided ring buffer is shared between the kernel and the application, but the head is private to the kernel as the application doesn't need to see it. However, this also prevents the application from knowing how many buffers the kernel has consumed. Usually this is fine, as the information is inherently racy in that the kernel could be consuming buffers continually, but for cleanup purposes it may be relevant to know how many buffers are still left in the ring. Add IORING_REGISTER_PBUF_STATUS which will return status for a given provided buffer ring. Right now it just returns the head, but space is reserved for more information later in, if needed. Link: https://github.com/axboe/liburing/discussions/1020 Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 10 ++++++++++ io_uring/kbuf.c | 26 ++++++++++++++++++++++++++ io_uring/kbuf.h | 1 + io_uring/register.c | 6 ++++++ 4 files changed, 43 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index db4b913e6b39..7a673b52827b 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -567,6 +567,9 @@ enum { /* register a range of fixed file slots for automatic slot allocation */ IORING_REGISTER_FILE_ALLOC_RANGE = 25, + /* return status information for a buffer group */ + IORING_REGISTER_PBUF_STATUS = 26, + /* this goes last */ IORING_REGISTER_LAST, @@ -693,6 +696,13 @@ struct io_uring_buf_reg { __u64 resv[3]; }; +/* argument for IORING_REGISTER_PBUF_STATUS */ +struct io_uring_buf_status { + __u32 buf_group; /* input */ + __u32 head; /* output */ + __u32 resv[8]; +}; + /* * io_uring_restriction->opcode values */ diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 72b6af1d2ed3..18df5a9d2f5e 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -750,6 +750,32 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } +int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_buf_status buf_status; + struct io_buffer_list *bl; + int i; + + if (copy_from_user(&buf_status, arg, sizeof(buf_status))) + return -EFAULT; + + for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) + if (buf_status.resv[i]) + return -EINVAL; + + bl = io_buffer_get_list(ctx, buf_status.buf_group); + if (!bl) + return -ENOENT; + if (!bl->is_mapped) + return -EINVAL; + + buf_status.head = bl->head; + if (copy_to_user(arg, &buf_status, sizeof(buf_status))) + return -EFAULT; + + return 0; +} + void *io_pbuf_get_address(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 9be5960817ea..53dfaa71a397 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -53,6 +53,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); +int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); void io_kbuf_mmap_list_free(struct io_ring_ctx *ctx); diff --git a/io_uring/register.c b/io_uring/register.c index a4286029e920..708dd1d89add 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -542,6 +542,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_file_alloc_range(ctx, arg); break; + case IORING_REGISTER_PBUF_STATUS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_pbuf_status(ctx, arg); + break; default: ret = -EINVAL; break; -- cgit v1.2.3