summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 22:40:31 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2023-04-26 22:40:31 +0300
commit5b9a7bb72fddbc5247f56ede55d485fab7abdf92 (patch)
treef2871b8c63b876eeda5ce8f8cef4b590d772692c /include
parent5c7ecada25d2086aee607ff7deb69e77faa4aa92 (diff)
parent3c85cc43c8e7855d202da184baf00c7b8eeacf71 (diff)
downloadlinux-5b9a7bb72fddbc5247f56ede55d485fab7abdf92.tar.xz
Merge tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux
Pull io_uring updates from Jens Axboe: - Cleanup of the io-wq per-node mapping, notably getting rid of it so we just have a single io_wq entry per ring (Breno) - Followup to the above, move accounting to io_wq as well and completely drop struct io_wqe (Gabriel) - Enable KASAN for the internal io_uring caches (Breno) - Add support for multishot timeouts. Some applications use timeouts to wake someone waiting on completion entries, and this makes it a bit easier to just have a recurring timer rather than needing to rearm it every time (David) - Support archs that have shared cache coloring between userspace and the kernel, and hence have strict address requirements for mmap'ing the ring into userspace. This should only be parisc/hppa. (Helge, me) - XFS has supported O_DIRECT writes without needing to lock the inode exclusively for a long time, and ext4 now supports it as well. This is true for the common cases of not extending the file size. Flag the fs as having that feature, and utilize that to avoid serializing those writes in io_uring (me) - Enable completion batching for uring commands (me) - Revert patch adding io_uring restriction to what can be GUP mapped or not. This does not belong in io_uring, as io_uring isn't really special in this regard. Since this is also getting in the way of cleanups and improvements to the GUP code, get rid of if (me) - A few series greatly reducing the complexity of registered resources, like buffers or files. Not only does this clean up the code a lot, the simplified code is also a LOT more efficient (Pavel) - Series optimizing how we wait for events and run task_work related to it (Pavel) - Fixes for file/buffer unregistration with DEFER_TASKRUN (Pavel) - Misc cleanups and improvements (Pavel, me) * tag 'for-6.4/io_uring-2023-04-21' of git://git.kernel.dk/linux: (71 commits) Revert "io_uring/rsrc: disallow multi-source reg buffers" io_uring: add support for multishot timeouts io_uring/rsrc: disassociate nodes and rsrc_data io_uring/rsrc: devirtualise rsrc put callbacks io_uring/rsrc: pass node to io_rsrc_put_work() io_uring/rsrc: inline io_rsrc_put_work() io_uring/rsrc: add empty flag in rsrc_node io_uring/rsrc: merge nodes and io_rsrc_put io_uring/rsrc: infer node from ctx on io_queue_rsrc_removal io_uring/rsrc: remove unused io_rsrc_node::llist io_uring/rsrc: refactor io_queue_rsrc_removal io_uring/rsrc: simplify single file node switching io_uring/rsrc: clean up __io_sqe_buffers_update() io_uring/rsrc: inline switch_start fast path io_uring/rsrc: remove rsrc_data refs io_uring/rsrc: fix DEFER_TASKRUN rsrc quiesce io_uring/rsrc: use wq for quiescing io_uring/rsrc: refactor io_rsrc_ref_quiesce io_uring/rsrc: remove io_rsrc_node::done io_uring/rsrc: use nospec'ed indexes ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/fs.h3
-rw-r--r--include/linux/io_uring_types.h24
-rw-r--r--include/trace/events/io_uring.h15
-rw-r--r--include/uapi/linux/io_uring.h33
4 files changed, 43 insertions, 32 deletions
diff --git a/include/linux/fs.h b/include/linux/fs.h
index ef2281a2acce..67495ef79bb2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_NOREUSE ((__force fmode_t)0x800000)
+/* File supports non-exclusive O_DIRECT writes from multiple threads */
+#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 00689c12f6ab..1b2a20a42413 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -188,8 +188,10 @@ struct io_ev_fd {
};
struct io_alloc_cache {
- struct hlist_head list;
+ struct io_wq_work_node list;
unsigned int nr_cached;
+ unsigned int max_cached;
+ size_t elem_size;
};
struct io_ring_ctx {
@@ -239,7 +241,6 @@ struct io_ring_ctx {
* uring_lock, and updated through io_uring_register(2)
*/
struct io_rsrc_node *rsrc_node;
- int rsrc_cached_refs;
atomic_t cancel_seq;
struct io_file_table file_table;
unsigned nr_user_files;
@@ -295,7 +296,7 @@ struct io_ring_ctx {
spinlock_t completion_lock;
bool poll_multi_queue;
- bool cq_waiting;
+ atomic_t cq_wait_nr;
/*
* ->iopoll_list is protected by the ctx->uring_lock for
@@ -325,16 +326,15 @@ struct io_ring_ctx {
struct io_restriction restrictions;
/* slow path rsrc auxilary data, used by update/register */
- struct io_rsrc_node *rsrc_backup_node;
struct io_mapped_ubuf *dummy_ubuf;
struct io_rsrc_data *file_data;
struct io_rsrc_data *buf_data;
- struct delayed_work rsrc_put_work;
- struct callback_head rsrc_put_tw;
- struct llist_head rsrc_put_llist;
+ /* protected by ->uring_lock */
struct list_head rsrc_ref_list;
- spinlock_t rsrc_ref_lock;
+ struct io_alloc_cache rsrc_node_cache;
+ struct wait_queue_head rsrc_quiesce_wq;
+ unsigned rsrc_quiesce;
struct list_head io_buffers_pages;
@@ -366,6 +366,11 @@ struct io_ring_ctx {
unsigned evfd_last_cq_tail;
};
+struct io_tw_state {
+ /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */
+ bool locked;
+};
+
enum {
REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT,
REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT,
@@ -472,7 +477,7 @@ enum {
REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT),
};
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
struct io_task_work {
struct llist_node node;
@@ -562,6 +567,7 @@ struct io_kiocb {
atomic_t refs;
atomic_t poll_refs;
struct io_task_work io_task_work;
+ unsigned nr_tw;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
union {
struct hlist_node hash_node;
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 936fd41bf147..69454f1f98b0 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -360,19 +360,18 @@ TRACE_EVENT(io_uring_complete,
);
/**
- * io_uring_submit_sqe - called before submitting one SQE
+ * io_uring_submit_req - called before submitting a request
*
* @req: pointer to a submitted request
- * @force_nonblock: whether a context blocking or not
*
* Allows to track SQE submitting, to understand what was the source of it, SQ
* thread or io_uring_enter call.
*/
-TRACE_EVENT(io_uring_submit_sqe,
+TRACE_EVENT(io_uring_submit_req,
- TP_PROTO(struct io_kiocb *req, bool force_nonblock),
+ TP_PROTO(struct io_kiocb *req),
- TP_ARGS(req, force_nonblock),
+ TP_ARGS(req),
TP_STRUCT__entry (
__field( void *, ctx )
@@ -380,7 +379,6 @@ TRACE_EVENT(io_uring_submit_sqe,
__field( unsigned long long, user_data )
__field( u8, opcode )
__field( u32, flags )
- __field( bool, force_nonblock )
__field( bool, sq_thread )
__string( op_str, io_uring_get_opcode(req->opcode) )
@@ -392,16 +390,15 @@ TRACE_EVENT(io_uring_submit_sqe,
__entry->user_data = req->cqe.user_data;
__entry->opcode = req->opcode;
__entry->flags = req->flags;
- __entry->force_nonblock = force_nonblock;
__entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL;
__assign_str(op_str, io_uring_get_opcode(req->opcode));
),
TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%x, "
- "non block %d, sq_thread %d", __entry->ctx, __entry->req,
+ "sq_thread %d", __entry->ctx, __entry->req,
__entry->user_data, __get_str(op_str),
- __entry->flags, __entry->force_nonblock, __entry->sq_thread)
+ __entry->flags, __entry->sq_thread)
);
/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 709de6d4feb2..0716cb17e436 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -250,6 +250,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_REALTIME (1U << 3)
#define IORING_LINK_TIMEOUT_UPDATE (1U << 4)
#define IORING_TIMEOUT_ETIME_SUCCESS (1U << 5)
+#define IORING_TIMEOUT_MULTISHOT (1U << 6)
#define IORING_TIMEOUT_CLOCK_MASK (IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK (IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)
/*
@@ -389,6 +390,9 @@ enum {
#define IORING_OFF_SQ_RING 0ULL
#define IORING_OFF_CQ_RING 0x8000000ULL
#define IORING_OFF_SQES 0x10000000ULL
+#define IORING_OFF_PBUF_RING 0x80000000ULL
+#define IORING_OFF_PBUF_SHIFT 16
+#define IORING_OFF_MMAP_MASK 0xf8000000ULL
/*
* Filled with the offset for mmap(2)
@@ -568,19 +572,6 @@ struct io_uring_rsrc_update2 {
__u32 resv2;
};
-struct io_uring_notification_slot {
- __u64 tag;
- __u64 resv[3];
-};
-
-struct io_uring_notification_register {
- __u32 nr_slots;
- __u32 resv;
- __u64 resv2;
- __u64 data;
- __u64 resv3;
-};
-
/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP (-2)
@@ -635,12 +626,26 @@ struct io_uring_buf_ring {
};
};
+/*
+ * Flags for IORING_REGISTER_PBUF_RING.
+ *
+ * IOU_PBUF_RING_MMAP: If set, kernel will allocate the memory for the ring.
+ * The application must not set a ring_addr in struct
+ * io_uring_buf_reg, instead it must subsequently call
+ * mmap(2) with the offset set as:
+ * IORING_OFF_PBUF_RING | (bgid << IORING_OFF_PBUF_SHIFT)
+ * to get a virtual mapping for the ring.
+ */
+enum {
+ IOU_PBUF_RING_MMAP = 1,
+};
+
/* argument for IORING_(UN)REGISTER_PBUF_RING */
struct io_uring_buf_reg {
__u64 ring_addr;
__u32 ring_entries;
__u16 bgid;
- __u16 pad;
+ __u16 flags;
__u64 resv[3];
};