From 5aa75ed5b93f086c455a3c67239b0471ff5a1526 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Feb 2021 12:56:50 -0700 Subject: io_uring: tie async worker side to the task context Move it outside of the io_ring_ctx, and tie it to the io_uring task context. Signed-off-by: Jens Axboe --- include/linux/io_uring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 2eb6d19de336..0e95398998b6 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -36,6 +36,7 @@ struct io_uring_task { struct xarray xa; struct wait_queue_head wait; struct file *last; + void *io_wq; struct percpu_counter inflight; struct io_identity __identity; struct io_identity *identity; -- cgit v1.2.3 From 3bfe6106693b6b4ba175ad1f929c4660b8f59ca8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 16 Feb 2021 14:15:30 -0700 Subject: io-wq: fork worker threads from original task Instead of using regular kthread kernel threads, create kernel threads that are like a real thread that the task would create. This ensures that we get all the context that we need, without having to carry that state around. This greatly reduces the code complexity, and the risk of missing state for a given request type. With the move away from kthread, we can also dump everything related to assigned state to the new threads. Signed-off-by: Jens Axboe --- fs/io-wq.c | 301 +++++++++++++++++--------------------------------- fs/io-wq.h | 3 +- fs/io_uring.c | 7 ++ include/linux/sched.h | 3 + 4 files changed, 116 insertions(+), 198 deletions(-) (limited to 'include') diff --git a/fs/io-wq.c b/fs/io-wq.c index ec7f1106b659..b53f569b5b4e 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,12 +13,9 @@ #include #include #include -#include #include -#include -#include -#include #include +#include #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -57,13 +54,6 @@ struct io_worker { spinlock_t lock; struct rcu_head rcu; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *cur_creds; - const struct cred *saved_creds; - struct nsproxy *restore_nsproxy; }; #if BITS_PER_LONG == 64 @@ -122,6 +112,8 @@ struct io_wq { struct completion done; struct hlist_node cpuhp_node; + + pid_t task_pid; }; static enum cpuhp_state io_wq_online; @@ -137,61 +129,6 @@ static void io_worker_release(struct io_worker *worker) wake_up_process(worker->task); } -/* - * Note: drops the wqe->lock if returning true! The caller must re-acquire - * the lock in that case. Some callers need to restart handling if this - * happens, so we can't just re-acquire the lock on behalf of the caller. - */ -static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) -{ - bool dropped_lock = false; - - if (worker->saved_creds) { - revert_creds(worker->saved_creds); - worker->cur_creds = worker->saved_creds = NULL; - } - - if (current->files) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - - task_lock(current); - current->files = NULL; - current->nsproxy = worker->restore_nsproxy; - task_unlock(current); - } - - if (current->fs) - current->fs = NULL; - - /* - * If we have an active mm, we need to drop the wq lock before unusing - * it. If we do, return true and let the caller retry the idle loop. - */ - if (worker->mm) { - if (!dropped_lock) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - } - __set_current_state(TASK_RUNNING); - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; - } - -#ifdef CONFIG_BLK_CGROUP - if (worker->blkcg_css) { - kthread_associate_blkcg(NULL); - worker->blkcg_css = NULL; - } -#endif - if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - return dropped_lock; -} - static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, struct io_wq_work *work) { @@ -237,10 +174,6 @@ static void io_worker_exit(struct io_worker *worker) raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - if (__io_worker_unuse(wqe, worker)) { - __release(&wqe->lock); - raw_spin_lock_irq(&wqe->lock); - } acct->nr_workers--; raw_spin_unlock_irq(&wqe->lock); @@ -323,14 +256,7 @@ static void io_wqe_dec_running(struct io_worker *worker) static void io_worker_start(struct io_worker *worker) { - allow_kernel_signal(SIGINT); - - current->flags |= PF_IO_WORKER; - current->fs = NULL; - current->files = NULL; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - worker->restore_nsproxy = current->nsproxy; io_wqe_inc_running(worker); } @@ -387,7 +313,7 @@ static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } - return __io_worker_unuse(wqe, worker); + return false; } static inline unsigned int io_get_work_hash(struct io_wq_work *work) @@ -426,96 +352,23 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) return NULL; } -static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +static void io_flush_signals(void) { - if (worker->mm) { - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; + if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { + if (current->task_works) + task_work_run(); + clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); } - - if (mmget_not_zero(work->identity->mm)) { - kthread_use_mm(work->identity->mm); - worker->mm = work->identity->mm; - return; - } - - /* failed grabbing mm, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; -} - -static inline void io_wq_switch_blkcg(struct io_worker *worker, - struct io_wq_work *work) -{ -#ifdef CONFIG_BLK_CGROUP - if (!(work->flags & IO_WQ_WORK_BLKCG)) - return; - if (work->identity->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->identity->blkcg_css); - worker->blkcg_css = work->identity->blkcg_css; - } -#endif -} - -static void io_wq_switch_creds(struct io_worker *worker, - struct io_wq_work *work) -{ - const struct cred *old_creds = override_creds(work->identity->creds); - - worker->cur_creds = work->identity->creds; - if (worker->saved_creds) - put_cred(old_creds); /* creds set by previous switch */ - else - worker->saved_creds = old_creds; -} - -static void io_impersonate_work(struct io_worker *worker, - struct io_wq_work *work) -{ - if ((work->flags & IO_WQ_WORK_FILES) && - current->files != work->identity->files) { - task_lock(current); - current->files = work->identity->files; - current->nsproxy = work->identity->nsproxy; - task_unlock(current); - if (!work->identity->files) { - /* failed grabbing files, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) - current->fs = work->identity->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) - io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && - worker->cur_creds != work->identity->creds) - io_wq_switch_creds(worker, work); - if (work->flags & IO_WQ_WORK_FSIZE) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; - else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - io_wq_switch_blkcg(worker, work); -#ifdef CONFIG_AUDIT - current->loginuid = work->identity->loginuid; - current->sessionid = work->identity->sessionid; -#endif } static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - /* flush pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); cond_resched(); } -#ifdef CONFIG_AUDIT - current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); - current->sessionid = AUDIT_SID_UNSET; -#endif - spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -556,7 +409,6 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); - io_impersonate_work(worker, work); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -608,10 +460,11 @@ loop: goto loop; } raw_spin_unlock_irq(&wqe->lock); - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) continue; + if (fatal_signal_pending(current)) + break; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -635,8 +488,10 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (worker->flags & IO_WORKER_F_RUNNING) @@ -652,9 +507,10 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (!(worker->flags & IO_WORKER_F_RUNNING)) @@ -662,32 +518,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock_irq(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock_irq(&worker->wqe->lock); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static int task_thread(void *data, int index) { + struct io_worker *worker = data; + struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = &wqe->acct[index]; - struct io_worker *worker; + struct io_wq *wq = wqe->wq; + char buf[TASK_COMM_LEN]; - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) - return false; + sprintf(buf, "iou-wrk-%d", wq->task_pid); + set_task_comm(current, buf); - refcount_set(&worker->ref, 1); - worker->nulls_node.pprev = NULL; - worker->wqe = wqe; - spin_lock_init(&worker->lock); + current->pf_io_worker = worker; + worker->task = current; - worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, - "io_wqe_worker-%d/%d", index, wqe->node); - if (IS_ERR(worker->task)) { - kfree(worker); - return false; - } - kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); + set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node)); + current->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -703,8 +554,58 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) if (index == IO_WQ_ACCT_UNBOUND) atomic_inc(&wq->user->processes); + io_wqe_worker(data); + do_exit(0); +} + +static int task_thread_bound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_BOUND); +} + +static int task_thread_unbound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_UNBOUND); +} + +static pid_t fork_thread(int (*fn)(void *), void *arg) +{ + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + CLONE_IO|SIGCHLD; + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return kernel_clone(&args); +} + +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +{ + struct io_worker *worker; + pid_t pid; + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) + return false; + + refcount_set(&worker->ref, 1); + worker->nulls_node.pprev = NULL; + worker->wqe = wqe; + spin_lock_init(&worker->lock); + + if (index == IO_WQ_ACCT_BOUND) + pid = fork_thread(task_thread_bound, worker); + else + pid = fork_thread(task_thread_unbound, worker); + if (pid < 0) { + kfree(worker); + return false; + } refcount_inc(&wq->refs); - wake_up_process(worker->task); return true; } @@ -756,12 +657,17 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) static int io_wq_manager(void *data) { struct io_wq *wq = data; + char buf[TASK_COMM_LEN]; int node; - refcount_set(&wq->refs, 1); + sprintf(buf, "iou-mgr-%d", wq->task_pid); + set_task_comm(current, buf); + current->flags |= PF_IO_WORKER; + wq->manager = current; + complete(&wq->done); - while (!kthread_should_stop()) { + while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { for_each_node(node) { struct io_wqe *wqe = wq->wqes[node]; bool fork_worker[2] = { false, false }; @@ -782,11 +688,13 @@ static int io_wq_manager(void *data) } set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); + if (fatal_signal_pending(current)) + set_bit(IO_WQ_BIT_EXIT, &wq->state); } if (refcount_dec_and_test(&wq->refs)) { complete(&wq->done); - return 0; + do_exit(0); } /* if ERROR is set and we get here, we have workers to wake */ if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { @@ -795,7 +703,7 @@ static int io_wq_manager(void *data) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); } - return 0; + do_exit(0); } static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, @@ -919,7 +827,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { - send_sig(SIGINT, worker->task, 1); + set_notify_signal(worker->task); match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); @@ -1075,22 +983,21 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_LIST_HEAD(&wqe->all_list); } + wq->task_pid = current->pid; init_completion(&wq->done); + refcount_set(&wq->refs, 1); - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); - if (!IS_ERR(wq->manager)) { - wake_up_process(wq->manager); + current->flags |= PF_IO_WORKER; + ret = fork_thread(io_wq_manager, wq); + current->flags &= ~PF_IO_WORKER; + if (ret >= 0) { wait_for_completion(&wq->done); - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - ret = -ENOMEM; - goto err; - } reinit_completion(&wq->done); return wq; } - ret = PTR_ERR(wq->manager); - complete(&wq->done); + if (refcount_dec_and_test(&wq->refs)) + complete(&wq->done); err: cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) @@ -1110,7 +1017,7 @@ void io_wq_destroy(struct io_wq *wq) set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) - kthread_stop(wq->manager); + wake_up_process(wq->manager); rcu_read_lock(); for_each_node(node) diff --git a/fs/io-wq.h b/fs/io-wq.h index d2cf284b4641..83d56adabd16 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -137,6 +137,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { - return in_task() && (current->flags & PF_IO_WORKER); + return in_task() && (current->flags & PF_IO_WORKER) && + current->pf_io_worker; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 31402a19fca6..9d22ec9d9406 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1633,6 +1633,9 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + BUG_ON(!tctx); + BUG_ON(!tctx->io_wq); + trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); io_wq_enqueue(tctx->io_wq, &req->work); @@ -9240,6 +9243,10 @@ static int io_uring_flush(struct file *file, void *data) struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx = file->private_data; + /* Ignore helper thread files exit */ + if (current->flags & PF_IO_WORKER) + return 0; + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { io_uring_cancel_task_requests(ctx, NULL); io_req_caches_free(ctx, current); diff --git a/include/linux/sched.h b/include/linux/sched.h index 26f499810dfa..ef00bb22164c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -895,6 +895,9 @@ struct task_struct { /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; + /* PF_IO_WORKER */ + void *pf_io_worker; + u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME -- cgit v1.2.3 From 4379bf8bd70b5de6bba7d53015b0c36c57a634ee Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 15 Feb 2021 13:40:22 -0700 Subject: io_uring: remove io_identity We are no longer grabbing state, so no need to maintain an IO identity that we COW if there are changes. Signed-off-by: Jens Axboe --- fs/io-wq.c | 26 ++++++++++++ fs/io-wq.h | 2 +- fs/io_uring.c | 104 ++++++++++++----------------------------------- include/linux/io_uring.h | 19 --------- 4 files changed, 52 insertions(+), 99 deletions(-) (limited to 'include') diff --git a/fs/io-wq.c b/fs/io-wq.c index 41042119bf0f..acc67ed3a52c 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -53,6 +53,9 @@ struct io_worker { struct io_wq_work *cur_work; spinlock_t lock; + const struct cred *cur_creds; + const struct cred *saved_creds; + struct rcu_head rcu; }; @@ -171,6 +174,11 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; preempt_enable(); + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; + } + raw_spin_lock_irq(&wqe->lock); hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -312,6 +320,10 @@ static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) worker->flags |= IO_WORKER_F_FREE; hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; + } } static inline unsigned int io_get_work_hash(struct io_wq_work *work) @@ -359,6 +371,18 @@ static void io_flush_signals(void) } } +static void io_wq_switch_creds(struct io_worker *worker, + struct io_wq_work *work) +{ + const struct cred *old_creds = override_creds(work->creds); + + worker->cur_creds = work->creds; + if (worker->saved_creds) + put_cred(old_creds); /* creds set by previous switch */ + else + worker->saved_creds = old_creds; +} + static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { @@ -407,6 +431,8 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); + if (work->creds && worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); wq->do_work(work); io_assign_current_work(worker, NULL); diff --git a/fs/io-wq.h b/fs/io-wq.h index bbe05dd54716..584f0bd5a83d 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -78,7 +78,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct io_identity *identity; + const struct cred *creds; unsigned flags; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 6e88295758b5..6d851033e48d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1094,7 +1094,7 @@ static bool io_match_task(struct io_kiocb *head, continue; if (req->file && req->file->f_op == &io_uring_fops) return true; - if (req->work.identity->files == files) + if (req->task->files == files) return true; } return false; @@ -1218,31 +1218,6 @@ static inline void req_set_fail_links(struct io_kiocb *req) req->flags |= REQ_F_FAIL_LINK; } -/* - * None of these are dereferenced, they are simply used to check if any of - * them have changed. If we're under current and check they are still the - * same, we're fine to grab references to them for actual out-of-line use. - */ -static void io_init_identity(struct io_identity *id) -{ - id->files = current->files; - id->mm = current->mm; -#ifdef CONFIG_BLK_CGROUP - rcu_read_lock(); - id->blkcg_css = blkcg_css(); - rcu_read_unlock(); -#endif - id->creds = current_cred(); - id->nsproxy = current->nsproxy; - id->fs = current->fs; - id->fsize = rlimit(RLIMIT_FSIZE); -#ifdef CONFIG_AUDIT - id->loginuid = current->loginuid; - id->sessionid = current->sessionid; -#endif - refcount_set(&id->count, 1); -} - static inline void __io_req_init_async(struct io_kiocb *req) { memset(&req->work, 0, sizeof(req->work)); @@ -1255,17 +1230,10 @@ static inline void __io_req_init_async(struct io_kiocb *req) */ static inline void io_req_init_async(struct io_kiocb *req) { - struct io_uring_task *tctx = current->io_uring; - if (req->flags & REQ_F_WORK_INITIALIZED) return; __io_req_init_async(req); - - /* Grab a ref if this isn't our static identity */ - req->work.identity = tctx->identity; - if (tctx->identity != &tctx->__identity) - refcount_inc(&req->work.identity->count); } static void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1350,19 +1318,15 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) -{ - if (req->work.identity == &tctx->__identity) - return; - if (refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); -} - static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; + } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; struct io_uring_task *tctx = req->task->io_uring; @@ -1377,7 +1341,6 @@ static void io_req_clean_work(struct io_kiocb *req) } req->flags &= ~REQ_F_WORK_INITIALIZED; - io_put_identity(req->task->io_uring, req); } static void io_req_track_inflight(struct io_kiocb *req) @@ -1411,6 +1374,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } + if (!req->work.creds) + req->work.creds = get_current_cred(); } static void io_prep_async_link(struct io_kiocb *req) @@ -6376,9 +6341,9 @@ static void __io_queue_sqe(struct io_kiocb *req) const struct cred *old_creds = NULL; int ret; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - req->work.identity->creds != current_cred()) - old_creds = override_creds(req->work.identity->creds); + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && + req->work.creds != current_cred()) + old_creds = override_creds(req->work.creds); ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -6508,16 +6473,11 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, id = READ_ONCE(sqe->personality); if (id) { - struct io_identity *iod; - - iod = idr_find(&ctx->personality_idr, id); - if (unlikely(!iod)) - return -EINVAL; - refcount_inc(&iod->count); - __io_req_init_async(req); - get_cred(iod->creds); - req->work.identity = iod; + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) + return -EINVAL; + get_cred(req->work.creds); } state = &ctx->submit_state; @@ -7936,8 +7896,6 @@ static int io_uring_alloc_task_context(struct task_struct *task, tctx->last = NULL; atomic_set(&tctx->in_idle, 0); tctx->sqpoll = false; - io_init_identity(&tctx->__identity); - tctx->identity = &tctx->__identity; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -7951,9 +7909,6 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); - if (tctx->identity != &tctx->__identity) - kfree(tctx->identity); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; @@ -8593,13 +8548,11 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - struct io_identity *iod; + const struct cred *creds; - iod = idr_remove(&ctx->personality_idr, id); - if (iod) { - put_cred(iod->creds); - if (refcount_dec_and_test(&iod->count)) - kfree(iod); + creds = idr_remove(&ctx->personality_idr, id); + if (creds) { + put_cred(creds); return 0; } @@ -9300,8 +9253,7 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - struct io_identity *iod = p; - const struct cred *cred = iod->creds; + const struct cred *cred = p; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9732,21 +9684,15 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - struct io_identity *id; + const struct cred *creds; int ret; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) - return -ENOMEM; - - io_init_identity(id); - id->creds = get_current_cred(); + creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); - if (ret < 0) { - put_cred(id->creds); - kfree(id); - } + ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (ret < 0) + put_cred(creds); return ret; } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 0e95398998b6..c48fcbdc2ea8 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,23 +5,6 @@ #include #include -struct io_identity { - struct files_struct *files; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *creds; - struct nsproxy *nsproxy; - struct fs_struct *fs; - unsigned long fsize; -#ifdef CONFIG_AUDIT - kuid_t loginuid; - unsigned int sessionid; -#endif - refcount_t count; -}; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -38,8 +21,6 @@ struct io_uring_task { struct file *last; void *io_wq; struct percpu_counter inflight; - struct io_identity __identity; - struct io_identity *identity; atomic_t in_idle; bool sqpoll; -- cgit v1.2.3 From e54937963fa249595824439dc839c948188dea83 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 17 Feb 2021 10:14:21 -0700 Subject: net: remove cmsg restriction from io_uring based send/recvmsg calls No need to restrict these anymore, as the worker threads are direct clones of the original task. Hence we know for a fact that we can support anything that the regular task can. Since the only user of proto_ops->flags was to flag PROTO_CMSG_DATA_ONLY, kill the member and the flag definition too. Signed-off-by: Jens Axboe --- include/linux/net.h | 3 --- net/ipv4/af_inet.c | 1 - net/ipv6/af_inet6.c | 1 - net/socket.c | 10 ---------- 4 files changed, 15 deletions(-) (limited to 'include') diff --git a/include/linux/net.h b/include/linux/net.h index 9e2324efc26a..ba736b457a06 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -42,8 +42,6 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 -#define PROTO_CMSG_DATA_ONLY 0x0001 - #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; - unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a02ce89b56b5..1355e6c0d567 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, - .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1fb75f01756c..802f5111805a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, const struct proto_ops inet6_stream_ops = { .family = PF_INET6, - .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, diff --git a/net/socket.c b/net/socket.c index 7f0617ab5437..90a60899aae5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2411,10 +2411,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { - /* disallow ancillary data requests from this path */ - if (msg->msg_control || msg->msg_controllen) - return -EINVAL; - return ____sys_sendmsg(sock, msg, flags, NULL, 0); } @@ -2623,12 +2619,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { - if (msg->msg_control || msg->msg_controllen) { - /* disallow ancillary data reqs unless cmsg is plain data */ - if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY)) - return -EINVAL; - } - return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } -- cgit v1.2.3 From 1c0aa1fae1acb77c5f9917adb0e4cb4500b9f3a6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 20 Feb 2021 11:55:28 -0700 Subject: io_uring: flag new native workers with IORING_FEAT_NATIVE_WORKERS A few reasons to do this: - The naming of the manager and worker have changed. That's a user visible change, so makes sense to flag it. - Opening certain files that use ->signal (like /proc/self or /dev/tty) now works, and the flag tells the application upfront that this is the case. - Related to the above, using signalfd will now work as well. Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 +- include/uapi/linux/io_uring.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index 378cf79e66c9..cf9a5fa1ad03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9467,7 +9467,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac4e1738a9af..2514eb6b1cf2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -262,6 +262,7 @@ struct io_uring_params { #define IORING_FEAT_POLL_32BITS (1U << 6) #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) #define IORING_FEAT_EXT_ARG (1U << 8) +#define IORING_FEAT_NATIVE_WORKERS (1U << 9) /* * io_uring_register(2) opcodes and arguments -- cgit v1.2.3 From 8a378fb096a7f02943c72a428bbfd0029260efb6 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Feb 2021 12:27:49 -0700 Subject: io_uring: ensure io-wq context is always destroyed for tasks If the task ends up doing no IO, the context list is empty and we don't call into __io_uring_files_cancel() when the task exits. This can cause a leak of the io-wq structures. Ensure we always call __io_uring_files_cancel(), even if the task context list is empty. Fixes: 5aa75ed5b93f ("io_uring: tie async worker side to the task context") Signed-off-by: Jens Axboe --- fs/io_uring.c | 7 ++++--- include/linux/io_uring.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/fs/io_uring.c b/fs/io_uring.c index e62ad6bde569..0a435a6f265a 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8800,9 +8800,10 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) { io_uring_remove_task_files(tctx); - } else if (tctx->io_wq && current->flags & PF_EXITING) { - io_wq_destroy(tctx->io_wq); - tctx->io_wq = NULL; + if (tctx->io_wq) { + io_wq_destroy(tctx->io_wq); + tctx->io_wq = NULL; + } } } diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index c48fcbdc2ea8..51ede771cd99 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -43,7 +43,7 @@ static inline void io_uring_task_cancel(void) } static inline void io_uring_files_cancel(struct files_struct *files) { - if (current->io_uring && !xa_empty(¤t->io_uring->xa)) + if (current->io_uring) __io_uring_files_cancel(files); } static inline void io_uring_free(struct task_struct *tsk) -- cgit v1.2.3