From 5c251e9dc0e127bac6fc5b8e6696363d2e35f515 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:27 -0600 Subject: signal: Add task_sigpending() helper This is in preparation for maintaining signal_pending() as the decider of whether or not a schedule() loop should be broken, or continue sleeping. This is different than the core signal use cases, which really need to know whether an actual signal is pending or not. task_sigpending() returns non-zero if TIF_SIGPENDING is set. Only core kernel use cases should care about the distinction between the two, make sure those use the task_sigpending() helper. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-2-axboe@kernel.dk --- include/linux/sched/signal.h | 9 +++++++-- kernel/events/uprobes.c | 2 +- kernel/signal.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..404145dc536e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -353,11 +353,16 @@ static inline int restart_syscall(void) return -ERESTARTNOINTR; } -static inline int signal_pending(struct task_struct *p) +static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } +static inline int signal_pending(struct task_struct *p) +{ + return task_sigpending(p); +} + static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); @@ -365,7 +370,7 @@ static inline int __fatal_signal_pending(struct task_struct *p) static inline int fatal_signal_pending(struct task_struct *p) { - return signal_pending(p) && __fatal_signal_pending(p); + return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(long state, struct task_struct *p) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..edd0c985a939 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { + if (task_sigpending(t)) { spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); spin_unlock_irq(&t->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..b179eccc86d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -983,7 +983,7 @@ static inline bool wants_signal(int sig, struct task_struct *p) if (task_is_stopped_or_traced(p)) return false; - return task_curr(p) || !signal_pending(p); + return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) @@ -2822,7 +2822,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); - if (!signal_pending(t)) + if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) @@ -2856,7 +2856,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); - if (!signal_pending(tsk)) + if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; @@ -2900,7 +2900,7 @@ long do_no_restart_syscall(struct restart_block *param) static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { - if (signal_pending(tsk) && !thread_group_empty(tsk)) { + if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, ¤t->blocked); -- cgit v1.2.3 From 12db8b690010ccfadf9d0b49a1e1798e47dbbe1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:28 -0600 Subject: entry: Add support for TIF_NOTIFY_SIGNAL Add TIF_NOTIFY_SIGNAL handling in the generic entry code, which if set, will return true if signal_pending() is used in a wait loop. That causes an exit of the loop so that notify_signal tracehooks can be run. If the wait loop is currently inside a system call, the system call is restarted once task_work has been processed. In preparation for only having arch_do_signal() handle syscall restarts if _TIF_SIGPENDING isn't set, rename it to arch_do_signal_or_restart(). Pass in a boolean that tells the architecture specific signal handler if it should attempt to get a signal, or just process a potential syscall restart. For !CONFIG_GENERIC_ENTRY archs, add the TIF_NOTIFY_SIGNAL handling to get_signal(). This is done to minimize the needed architecture changes to support this feature. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-3-axboe@kernel.dk --- arch/x86/kernel/signal.c | 4 ++-- include/linux/entry-common.h | 11 ++++++++--- include/linux/entry-kvm.h | 4 ++-- include/linux/sched/signal.h | 11 ++++++++++- include/linux/tracehook.h | 27 +++++++++++++++++++++++++++ kernel/entry/common.c | 14 +++++++++++--- kernel/entry/kvm.c | 3 +++ kernel/signal.c | 14 ++++++++++++++ 8 files changed, 77 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5fa494c2304..ec3b9c6e5a4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ handle_signal(&ksig, regs); return; diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index efebbffcd5cc..c7bfac45f951 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -37,6 +37,10 @@ # define _TIF_UPROBE (0) #endif +#ifndef _TIF_NOTIFY_SIGNAL +# define _TIF_NOTIFY_SIGNAL (0) +#endif + /* * TIF flags handled in syscall_enter_from_usermode() */ @@ -69,7 +73,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @@ -226,12 +230,13 @@ static __always_inline void arch_exit_to_user_mode(void) { } #endif /** - * arch_do_signal - Architecture specific signal delivery function + * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs + * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). */ -void arch_do_signal(struct pt_regs *regs); +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal); /** * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit() diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 0cef17afb41a..9b93f8584ff7 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -11,8 +11,8 @@ # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif -#define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 404145dc536e..bd5afa076189 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -360,6 +360,15 @@ static inline int task_sigpending(struct task_struct *p) static inline int signal_pending(struct task_struct *p) { +#if defined(TIF_NOTIFY_SIGNAL) + /* + * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same + * behavior in terms of ensuring that we break out of wait loops + * so that notify signal callbacks can be processed. + */ + if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) + return 1; +#endif return task_sigpending(p); } @@ -507,7 +516,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); + WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 36fb3bbed6b2..1e8caca92e1f 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -198,4 +198,31 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) blkcg_maybe_throttle_current(); } +/* + * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This + * is currently used by TWA_SIGNAL based task_work, which requires breaking + * wait loops to ensure that task_work is noticed and run. + */ +static inline void tracehook_notify_signal(void) +{ +#if defined(TIF_NOTIFY_SIGNAL) + clear_thread_flag(TIF_NOTIFY_SIGNAL); + smp_mb__after_atomic(); + if (current->task_works) + task_work_run(); +#endif +} + +/* + * Called when we have work to process from exit_to_user_mode_loop() + */ +static inline void set_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE)) + kick_process(task); +#endif +} + #endif /* */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..42eff115c426 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -109,7 +109,15 @@ static __always_inline void exit_to_user_mode(void) } /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal(struct pt_regs *regs) { } +void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } + +static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) +{ + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + + arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); +} static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -131,8 +139,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (ti_work & _TIF_SIGPENDING) - arch_do_signal(regs); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + handle_signal_work(regs, ti_work); if (ti_work & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b828a3ddebf1 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; diff --git a/kernel/signal.c b/kernel/signal.c index b179eccc86d0..61b377e65c46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,6 +2529,20 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; + /* + * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so + * that the arch handlers don't all have to do it. If we get here + * without TIF_SIGPENDING, just exit after running signal work. + */ +#ifdef TIF_NOTIFY_SIGNAL + if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + tracehook_notify_signal(); + if (!task_sigpending(current)) + return false; + } +#endif + if (unlikely(uprobe_deny_signal())) return false; -- cgit v1.2.3 From 114518eb6430b832d2f9f5a008043b913ccf0e24 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:30 -0600 Subject: task_work: Use TIF_NOTIFY_SIGNAL if available If the arch supports TIF_NOTIFY_SIGNAL, then use that for TWA_SIGNAL as it's more efficient than using the signal delivery method. This is especially true on threaded applications, where ->sighand is shared across threads, but it's also lighter weight on non-shared cases. io_uring is a heavy consumer of TWA_SIGNAL based task_work. A test with threads shows a nice improvement running an io_uring based echo server. stock kernel: 0.01% <= 0.1 milliseconds 95.86% <= 0.2 milliseconds 98.27% <= 0.3 milliseconds 99.71% <= 0.4 milliseconds 100.00% <= 0.5 milliseconds 100.00% <= 0.6 milliseconds 100.00% <= 0.7 milliseconds 100.00% <= 0.8 milliseconds 100.00% <= 0.9 milliseconds 100.00% <= 1.0 milliseconds 100.00% <= 1.1 milliseconds 100.00% <= 2 milliseconds 100.00% <= 3 milliseconds 100.00% <= 3 milliseconds 1378930.00 requests per second ~1600% CPU 1.38M requests/second, and all 16 CPUs are maxed out. patched kernel: 0.01% <= 0.1 milliseconds 98.24% <= 0.2 milliseconds 99.47% <= 0.3 milliseconds 99.99% <= 0.4 milliseconds 100.00% <= 0.5 milliseconds 100.00% <= 0.6 milliseconds 100.00% <= 0.7 milliseconds 100.00% <= 0.8 milliseconds 100.00% <= 0.9 milliseconds 100.00% <= 1.2 milliseconds 1666111.38 requests per second ~1450% CPU 1.67M requests/second, and we're no longer just hammering on the sighand lock. The original reporter states: "For 5.7.15 my benchmark achieves 1.6M qps and system cpu is at ~80%. for 5.7.16 or later it achieves only 1M qps and the system cpu is is at ~100%" with the only difference there being that TWA_SIGNAL is used unconditionally in 5.7.16, since it's required to be able to handle the inability to run task_work if the application is waiting in the kernel already on an event that needs task_work run to be satisfied. Also see commit 0ba9c9edcd15. Reported-by: Roman Gershman Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-5-axboe@kernel.dk --- kernel/task_work.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 613b2d634af8..ae058893913c 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,6 +5,34 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/* + * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster + * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is + * shared for threads, and can cause contention on sighand->lock. Even for + * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking + * or IRQ disabling is involved for notification (or running) purposes. + */ +static void task_work_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + set_notify_signal(task); +#else + unsigned long flags; + + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } +#endif +} + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -28,7 +56,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; - unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -42,17 +69,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - /* - * Only grab the sighand lock if we don't already have some - * task_work pending. This pairs with the smp_store_mb() - * in get_signal(), see comment there. - */ - if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && - lock_task_sighand(task, &flags)) { - task->jobctl |= JOBCTL_TASK_WORK; - signal_wake_up(task, 0); - unlock_task_sighand(task, &flags); - } + task_work_notify_signal(task); break; } -- cgit v1.2.3 From c8d5ed67936fddbe2ae845fc80397718006322d7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:29 -0600 Subject: x86: Wire up TIF_NOTIFY_SIGNAL The generic entry code has support for TIF_NOTIFY_SIGNAL already. Just provide the TIF bit. [ tglx: Adopted to other TIF changes in x86 ] Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201026203230.386348-4-axboe@kernel.dk --- arch/x86/include/asm/thread_info.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a12b9644193b..06a171037558 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -91,6 +91,7 @@ struct thread_info { #define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ +#define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -118,6 +119,7 @@ struct thread_info { #define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) +#define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) #define _TIF_SLD (1 << TIF_SLD) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) -- cgit v1.2.3 From 45ff510517f3b1354a3d9c273ad5e5e8d08312cb Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 28 Oct 2020 09:36:32 -0700 Subject: entry: Fixup irqentry_enter() comment irq_enter_from_user_mode() was changed to irqentry_enter_from_user_mode(). Update the comment within irqentry_enter() to reflect this change. Suggested-by: Thomas Gleixner Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201028163632.965518-1-ira.weiny@intel.com --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 42eff115c426..f7ed415a6768 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -302,7 +302,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required - * as in irq_enter_from_user_mode(). + * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); -- cgit v1.2.3 From b6be002bcd1dd1dedb926abf3c90c794eacb77dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Nov 2020 12:53:16 -0800 Subject: x86/entry: Move nmi entry/exit into common code Lockdep state handling on NMI enter and exit is nothing specific to X86. It's not any different on other architectures. Also the extra state type is not necessary, irqentry_state_t can carry the necessary information as well. Move it to common code and extend irqentry_state_t to carry lockdep state. [ Ira: Make exit_rcu and lockdep a union as they are mutually exclusive between the IRQ and NMI exceptions, and add kernel documentation for struct irqentry_state_t ] Signed-off-by: Thomas Gleixner Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201102205320.1458656-7-ira.weiny@intel.com --- arch/x86/entry/common.c | 34 ---------------------------------- arch/x86/include/asm/idtentry.h | 3 --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/kernel/nmi.c | 6 +++--- arch/x86/kernel/traps.c | 13 +++++++------ include/linux/entry-common.h | 39 ++++++++++++++++++++++++++++++++++++++- kernel/entry/common.c | 36 ++++++++++++++++++++++++++++++++++++ 7 files changed, 87 insertions(+), 50 deletions(-) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..18d8f17f755c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..247a60a47331 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4102b866e7c0..f5c860b1a50b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1983,7 +1983,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1995,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. @@ -2006,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..e1b78829d909 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -651,12 +651,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -851,7 +852,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -908,7 +909,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -926,7 +927,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b9711e813ec2..1a128baf3628 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -346,8 +346,26 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); void irqentry_exit_to_user_mode(struct pt_regs *regs); #ifndef irqentry_state +/** + * struct irqentry_state - Opaque object for exception state storage + * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the + * exit path has to invoke rcu_irq_exit(). + * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that + * lockdep state is restored correctly on exit from nmi. + * + * This opaque object is filled in by the irqentry_*_enter() functions and + * must be passed back into the corresponding irqentry_*_exit() functions + * when the exception is complete. + * + * Callers of irqentry_*_[enter|exit]() must consider this structure opaque + * and all members private. Descriptions of the members are provided to aid in + * the maintenance of the irqentry_*() functions. + */ typedef struct irqentry_state { - bool exit_rcu; + union { + bool exit_rcu; + bool lockdep; + }; } irqentry_state_t; #endif @@ -407,4 +425,23 @@ void irqentry_exit_cond_resched(void); */ void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); +/** + * irqentry_nmi_enter - Handle NMI entry + * @regs: Pointer to currents pt_regs + * + * Similar to irqentry_enter() but taking care of the NMI constraints. + */ +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); + +/** + * irqentry_nmi_exit - Handle return from NMI handling + * @regs: Pointer to pt_regs (NMI entry regs) + * @irq_state: Return value from matching call to irqentry_nmi_enter() + * + * Last action before returning to the low level assmenbly code. + * + * Counterpart to irqentry_nmi_enter(). + */ +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state); + #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 3a1dfecc533e..bc75c114c1b3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -405,3 +405,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) rcu_irq_exit(); } } + +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + irq_state.lockdep = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (irq_state.lockdep) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (irq_state.lockdep) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} -- cgit v1.2.3 From 78a56e0494ad29feccd4c54c2b5682721f8cb988 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 4 Nov 2020 15:01:57 -0800 Subject: entry: Fix spelling/typo errors in irq entry code s/reguired/required/ s/Interupts/Interrupts/ s/quiescient/quiescent/ s/assemenbly/assembly/ Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201104230157.3378023-1-ira.weiny@intel.com --- include/linux/entry-common.h | 4 ++-- kernel/entry/common.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 1a128baf3628..aab549026ab8 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -415,7 +415,7 @@ void irqentry_exit_cond_resched(void); * @state: Return value from matching call to irqentry_enter() * * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and reguired and returns to + * preemption and work checks if possible and required and returns to * the caller with interrupts disabled and no further work pending. * * This is the last action before returning to the low level ASM code which @@ -438,7 +438,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); * @regs: Pointer to pt_regs (NMI entry regs) * @irq_state: Return value from matching call to irqentry_nmi_enter() * - * Last action before returning to the low level assmenbly code. + * Last action before returning to the low level assembly code. * * Counterpart to irqentry_nmi_enter(). */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bc75c114c1b3..fa17baadf63e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -304,7 +304,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * If this entry hit the idle task invoke rcu_irq_enter() whether * RCU is watching or not. * - * Interupts can nest when the first interrupt invokes softirq + * Interrupts can nest when the first interrupt invokes softirq * processing on return which enables interrupts. * * Scheduler ticks in the idle task can mark quiescent state and @@ -315,7 +315,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. + * quiescent state and end grace periods prematurely. * * Unconditionally invoke rcu_irq_enter() so RCU state stays * consistent. -- cgit v1.2.3 From b4581a52caff79eab1ea6caaaa4e08526ce2782b Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:57 -0500 Subject: x86: Expose syscall_work field in thread_info This field will be used by SYSCALL_WORK flags, migrated from TI flags. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-2-krisman@collabora.com --- arch/x86/include/asm/thread_info.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 06a171037558..0da5d58d7c79 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,6 +55,7 @@ struct task_struct; struct thread_info { unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ u32 status; /* thread synchronous flags */ }; -- cgit v1.2.3 From 3136b93c3fb2b7c19e853e049203ff8f2b9dd2cd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:58 -0500 Subject: entry: Expose helpers to migrate TIF to SYSCALL_WORK flags With the goal to split the syscall work related flags into a separate field that is architecture independent, expose transitional helpers that resolve to either the TIF flags or to the corresponding SYSCALL_WORK flags. This will allow architectures to migrate only when they port to the generic syscall entry code. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-3-krisman@collabora.com --- include/linux/thread_info.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e93e249a4e9b..0e9fb15d6b42 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -97,6 +97,38 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) +#ifdef CONFIG_GENERIC_ENTRY +#define set_syscall_work(fl) \ + set_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) +#define test_syscall_work(fl) \ + test_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) +#define clear_syscall_work(fl) \ + clear_bit(SYSCALL_WORK_BIT_##fl, ¤t_thread_info()->syscall_work) + +#define set_task_syscall_work(t, fl) \ + set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) +#define test_task_syscall_work(t, fl) \ + test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) +#define clear_task_syscall_work(t, fl) \ + clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work) + +#else /* CONFIG_GENERIC_ENTRY */ + +#define set_syscall_work(fl) \ + set_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) +#define test_syscall_work(fl) \ + test_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) +#define clear_syscall_work(fl) \ + clear_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + +#define set_task_syscall_work(t, fl) \ + set_ti_thread_flag(task_thread_info(t), TIF_##fl) +#define test_task_syscall_work(t, fl) \ + test_ti_thread_flag(task_thread_info(t), TIF_##fl) +#define clear_task_syscall_work(t, fl) \ + clear_ti_thread_flag(task_thread_info(t), TIF_##fl) +#endif /* !CONFIG_GENERIC_ENTRY */ + #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES -- cgit v1.2.3 From b86678cf0f1d76062aa964c5f0c6c89fe5a6dcfd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:59 -0500 Subject: entry: Wire up syscall_work in common entry code Prepare the common entry code to use the SYSCALL_WORK flags. They will be defined in subsequent patches for each type of syscall work. SYSCALL_WORK_ENTRY/EXIT are defined for the transition, as they will replace the TIF_ equivalent defines. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-4-krisman@collabora.com --- include/linux/entry-common.h | 3 +++ kernel/entry/common.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index aab549026ab8..3fe8f868f15e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -64,6 +64,9 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_EXIT (0) + /* * TIF flags handled in exit_to_user_mode_loop() */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fa17baadf63e..e7a11e38daba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work) + unsigned long ti_work, unsigned long work) { long ret = 0; @@ -74,11 +74,12 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long ti_work; ti_work = READ_ONCE(current_thread_info()->flags); - if (ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work); + if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work, work); return syscall; } @@ -225,7 +226,8 @@ static inline bool report_single_step(unsigned long ti_work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, + unsigned long work) { bool step; @@ -245,6 +247,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) */ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); @@ -262,8 +265,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags); + if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From 23d67a54857a768acdb0804cdd6037c324a50ecd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:00 -0500 Subject: seccomp: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SECCOMP, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-5-krisman@collabora.com --- include/asm-generic/syscall.h | 2 +- include/linux/entry-common.h | 8 ++------ include/linux/seccomp.h | 2 +- include/linux/thread_info.h | 6 ++++++ kernel/entry/common.c | 2 +- kernel/fork.c | 2 +- kernel/seccomp.c | 6 +++--- 7 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index f3135e734387..524d8e68ff5e 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -135,7 +135,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. + * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 3fe8f868f15e..fa3cdb102dbf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -21,10 +21,6 @@ # define _TIF_SYSCALL_TRACEPOINT (0) #endif -#ifndef _TIF_SECCOMP -# define _TIF_SECCOMP (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -49,7 +45,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -64,7 +60,7 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) #define SYSCALL_WORK_EXIT (0) /* diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..47763f3999f7 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -42,7 +42,7 @@ struct seccomp { extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { - if (unlikely(test_thread_flag(TIF_SECCOMP))) + if (unlikely(test_syscall_work(SECCOMP))) return __secure_computing(NULL); return 0; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0e9fb15d6b42..a308ba4ef07b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,12 @@ enum { GOOD_STACK, }; +enum syscall_work_bit { + SYSCALL_WORK_BIT_SECCOMP, +}; + +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) + #include #ifdef __KERNEL__ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e7a11e38daba..5747a6eb2c48 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -54,7 +54,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, } /* Do seccomp after ptrace, to catch any tracer changes. */ - if (ti_work & _TIF_SECCOMP) { + if (work & SYSCALL_WORK_SECCOMP) { ret = __secure_computing(NULL); if (ret == -1L) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..bc5b1090f415 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1625,7 +1625,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..f67e92d11ad7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task, task->seccomp.mode = seccomp_mode; /* - * Make sure TIF_SECCOMP cannot be set before the mode (and + * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); - set_tsk_thread_flag(task, TIF_SECCOMP); + set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -929,7 +929,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* * Make sure that any changes to mode from another thread have - * been seen after TIF_SECCOMP was seen. + * been seen after SYSCALL_WORK_SECCOMP was seen. */ rmb(); -- cgit v1.2.3 From 524666cb5de7c38a1925e7401a6e59d68682dd8c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:01 -0500 Subject: tracepoints: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACEPOINT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-6-krisman@collabora.com --- include/linux/entry-common.h | 13 +++++-------- include/linux/thread_info.h | 2 ++ include/trace/syscall.h | 6 +++--- kernel/entry/common.c | 4 ++-- kernel/trace/trace_events.c | 8 ++++---- kernel/tracepoint.c | 4 ++-- 6 files changed, 18 insertions(+), 19 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index fa3cdb102dbf..2a01eee2dbba 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -17,10 +17,6 @@ # define _TIF_SYSCALL_EMU (0) #endif -#ifndef _TIF_SYSCALL_TRACEPOINT -# define _TIF_SYSCALL_TRACEPOINT (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -46,7 +42,7 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,10 +54,11 @@ #define SYSCALL_EXIT_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) + ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) -#define SYSCALL_WORK_EXIT (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ + SYSCALL_WORK_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a308ba4ef07b..c232043c12d3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -37,9 +37,11 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, + SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) +#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #include diff --git a/include/trace/syscall.h b/include/trace/syscall.h index dc8ac27d27c1..8e193f3a33b3 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -37,10 +37,10 @@ struct syscall_metadata { #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + if (test_syscall_work(SYSCALL_TRACEPOINT)) + set_task_syscall_work(p, SYSCALL_TRACEPOINT); else - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5747a6eb2c48..f651967847ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -63,7 +63,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); @@ -233,7 +233,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, audit_syscall_exit(regs); - if (ti_work & _TIF_SYSCALL_TRACEPOINT) + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..adf65b502453 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3428,10 +3428,10 @@ static __init int event_trace_enable(void) * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need - * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() - * is called before pid 1 starts, and this flag is never set, making - * the syscall tracepoint never get reached, but the event is enabled - * regardless (and not doing anything). + * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But + * event_trace_enable() is called before pid 1 starts, and this flag + * is never set, making the syscall tracepoint never get reached, but + * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3f659f855074..7261fa0f5e3c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -594,7 +594,7 @@ int syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } @@ -611,7 +611,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } -- cgit v1.2.3 From 64c19ba29b66e98af9306b4a7525fb22c895d252 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:02 -0500 Subject: ptrace: Migrate to use SYSCALL_TRACE flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACE, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-7-krisman@collabora.com --- include/asm-generic/syscall.h | 15 ++++++++------- include/linux/entry-common.h | 10 ++++++---- include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 17 +++++++++-------- kernel/entry/common.c | 4 ++-- kernel/fork.c | 2 +- kernel/ptrace.c | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 524d8e68ff5e..ed94e5658d0c 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,7 +43,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), * after tracehook_report_syscall_entry() returned nonzero to prevent * the system call from taking place. * @@ -63,7 +63,7 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +76,7 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +93,7 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +108,7 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +123,7 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +135,8 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2a01eee2dbba..ae426ab9c372 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -41,7 +41,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -53,12 +53,14 @@ #endif #define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ ARCH_SYSCALL_EXIT_WORK) #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ - SYSCALL_WORK_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) + SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c232043c12d3..761a4590d554 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,10 +38,12 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, + SYSCALL_WORK_BIT_SYSCALL_TRACE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..3f20368afe9e 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,11 +83,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, - * when the current task has just entered the kernel for a system call. - * Full user register state is available here. Changing the values - * in @regs can affect the system call number and arguments to be tried. - * It is safe to block here, preventing the system call from beginning. + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %TIF_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is @@ -109,15 +110,15 @@ static inline __must_check int tracehook_report_syscall_entry( * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just finished an attempted system call. Full + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %TIF_SYSCALL_TRACE might not be set. + * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f651967847ec..917328a9edaa 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { ret = arch_syscall_enter_tracehook(regs); if (ret || (ti_work & _TIF_SYSCALL_EMU)) return -1L; @@ -237,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index bc5b1090f415..99f68c20f2ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2158,7 +2158,7 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + clear_task_syscall_work(p, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..55a2bc3186a7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif @@ -812,9 +812,9 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) -- cgit v1.2.3 From 64eb35f701f04b30706e21d1b02636b5d31a37d2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:03 -0500 Subject: ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_EMU, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-8-krisman@collabora.com --- include/linux/entry-common.h | 8 ++------ include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 2 +- kernel/entry/common.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 +++++----- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ae426ab9c372..b30f82bed92b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_EMU -# define _TIF_SYSCALL_EMU (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -42,7 +38,6 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,7 +53,8 @@ #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_EMU) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 761a4590d554..85b8a4216168 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -39,11 +39,13 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, + SYSCALL_WORK_BIT_SYSCALL_EMU, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3f20368afe9e..54b925224a13 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -84,7 +84,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %TIF_SYSCALL_EMU have been set, when the current task has just + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. It is safe to block here, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 917328a9edaa..90533f34ea99 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,9 +47,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -208,21 +208,22 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif @@ -236,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); + step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index 99f68c20f2ff..02b689a23457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2159,8 +2159,8 @@ static __latent_entropy struct task_struct *copy_process( */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 55a2bc3186a7..237bcd6d255c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,8 +118,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -816,11 +816,11 @@ static int ptrace_resume(struct task_struct *child, long request, else clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { -- cgit v1.2.3 From 785dc4eb7fd74e3b7f4eac468457b633117e1aea Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:04 -0500 Subject: audit: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_AUDIT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-9-krisman@collabora.com --- include/asm-generic/syscall.h | 23 ++++++++++++++--------- include/linux/entry-common.h | 18 ++++++------------ include/linux/thread_info.h | 2 ++ kernel/auditsc.c | 4 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index ed94e5658d0c..524218ae3825 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,9 +43,9 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), - * after tracehook_report_syscall_entry() returned nonzero to prevent - * the system call from taking place. + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT), after tracehook_report_syscall_entry() + * returned nonzero to prevent the system call from taking place. * * This rolls back the register state in @regs so it's as if the * system call instruction was a no-op. The registers containing @@ -63,7 +63,8 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +77,8 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +95,8 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +111,8 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +127,8 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +140,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %SYSCALL_WORK_SYSCALL_AUDIT, or * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b30f82bed92b..d7b96f42817f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_AUDIT -# define _TIF_SYSCALL_AUDIT (0) -#endif - #ifndef _TIF_PATCH_PENDING # define _TIF_PATCH_PENDING (0) #endif @@ -36,9 +32,7 @@ # define ARCH_SYSCALL_ENTER_WORK (0) #endif -#define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_ENTER_WORK) +#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK /* * TIF flags handled in syscall_exit_to_user_mode() @@ -47,16 +41,16 @@ # define ARCH_SYSCALL_EXIT_WORK (0) #endif -#define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_EMU) + SYSCALL_WORK_SYSCALL_EMU | \ + SYSCALL_WORK_SYSCALL_AUDIT) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_AUDIT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 85b8a4216168..317363212ae9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -40,12 +40,14 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, + SYSCALL_WORK_BIT_SYSCALL_AUDIT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) +#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..c00aa5837965 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -952,7 +952,7 @@ int audit_alloc(struct task_struct *tsk) state = audit_filter_task(tsk, &key); if (state == AUDIT_DISABLED) { - clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } @@ -964,7 +964,7 @@ int audit_alloc(struct task_struct *tsk) context->filterkey = key; audit_set_context(tsk, context); - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } -- cgit v1.2.3 From 2991552447707d791d9d81a5dc161f9e9e90b163 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:05 -0500 Subject: entry: Drop usage of TIF flags in the generic syscall code Now that the flags migration in the common syscall entry code is complete and the code relies exclusively on thread_info::syscall_work, clean up the accesses to TI flags in that path. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-10-krisman@collabora.com --- include/linux/entry-common.h | 26 ++++++++++++-------------- kernel/entry/common.c | 17 +++++++---------- 2 files changed, 19 insertions(+), 24 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d7b96f42817f..49b26b216e4e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -26,31 +26,29 @@ #endif /* - * TIF flags handled in syscall_enter_from_user_mode() + * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() */ -#ifndef ARCH_SYSCALL_ENTER_WORK -# define ARCH_SYSCALL_ENTER_WORK (0) +#ifndef ARCH_SYSCALL_WORK_ENTER +# define ARCH_SYSCALL_WORK_ENTER (0) #endif -#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK - /* - * TIF flags handled in syscall_exit_to_user_mode() + * SYSCALL_WORK flags handled in syscall_exit_to_user_mode() */ -#ifndef ARCH_SYSCALL_EXIT_WORK -# define ARCH_SYSCALL_EXIT_WORK (0) +#ifndef ARCH_SYSCALL_WORK_EXIT +# define ARCH_SYSCALL_WORK_EXIT (0) #endif -#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK - #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_EXIT) /* * TIF flags handled in exit_to_user_mode_loop() @@ -136,8 +134,8 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * * It handles the following work items: * - * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(), - * __secure_computing(), trace_sys_enter() + * 1) syscall_work flag dependent invocations of + * arch_syscall_enter_tracehook(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90533f34ea99..91e8fd50adf4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work, unsigned long work) + unsigned long work) { long ret = 0; @@ -75,11 +75,9 @@ static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - unsigned long ti_work; - ti_work = READ_ONCE(current_thread_info()->flags); - if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work, work); + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); return syscall; } @@ -227,8 +225,8 @@ static inline bool report_single_step(unsigned long work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, - unsigned long work) + +static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; @@ -249,7 +247,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -266,8 +263,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags, work); + if (unlikely(work & SYSCALL_WORK_EXIT)) + syscall_exit_work(regs, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From 51af3f23063946344330a77a7d1dece6fc6bb5d8 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:06 -0500 Subject: x86: Reclaim unused x86 TI flags Reclaim TI flags that were migrated to syscall_work flags. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-11-krisman@collabora.com --- arch/x86/include/asm/thread_info.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0da5d58d7c79..0d751d5da702 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -75,15 +75,11 @@ struct thread_info { * - these are process state flags that various assembly files * may need to access */ -#define TIF_SYSCALL_TRACE 0 /* syscall trace active */ #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ #define TIF_SSBD 5 /* Speculative store bypass disable */ -#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ -#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ -#define TIF_SECCOMP 8 /* secure computing */ #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ #define TIF_SPEC_FORCE_UPDATE 10 /* Force speculation MSR update in context switch */ #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ @@ -100,18 +96,13 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ -#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ -#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_SSBD (1 << TIF_SSBD) -#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) -#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) -#define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) @@ -127,7 +118,6 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) -#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) /* flags to check in __switch_to() */ -- cgit v1.2.3 From 83c2da2e605c73aafcc02df04b2dbf1ccbfc24c0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:33 +0100 Subject: context_tracking: Introduce HAVE_CONTEXT_TRACKING_OFFSTACK Historically, context tracking had to deal with fragile entry code path, ie: before user_exit() is called and after user_enter() is called, in case some of those spots would call schedule() or use RCU. On such cases, the site had to be protected between exception_enter() and exception_exit() that save the context tracking state in the task stack. Such sleepable fragile code path had many different origins: tracing, exceptions, early or late calls to context tracking on syscalls... Aside of that not being pretty, saving the context tracking state on the task stack forces us to run context tracking on all CPUs, including housekeepers, and prevents us to completely shutdown nohz_full at runtime on a CPU in the future as context tracking and its overhead would still need to run system wide. Now thanks to the extensive efforts to sanitize x86 entry code, those conditions have been removed and we can now get rid of these workarounds in this architecture. Create a Kconfig feature to express this achievement. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-2-frederic@kernel.org --- arch/Kconfig | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..090ef3566c56 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -618,6 +618,23 @@ config HAVE_CONTEXT_TRACKING protected inside rcu_irq_enter/rcu_irq_exit() but preemption or signal handling on irq exit still need to be protected. +config HAVE_CONTEXT_TRACKING_OFFSTACK + bool + help + Architecture neither relies on exception_enter()/exception_exit() + nor on schedule_user(). Also preempt_schedule_notrace() and + preempt_schedule_irq() can't be called in a preemptible section + while context tracking is CONTEXT_USER. This feature reflects a sane + entry implementation where the following requirements are met on + critical entry code, ie: before user_exit() or after user_enter(): + + - Critical entry code isn't preemptible (or better yet: + not interruptible). + - No use of RCU read side critical sections, unless rcu_nmi_enter() + got called. + - No use of instrumentation, unless instrumentation_begin() got + called. + config HAVE_TIF_NOHZ bool help -- cgit v1.2.3 From 179a9cf79212bb3b96fb69a314583189cd863c5b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:34 +0100 Subject: context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK The typical steps with context tracking are: 1) Task runs in userspace 2) Task enters the kernel (syscall/exception/IRQ) 3) Task switches from context tracking state CONTEXT_USER to CONTEXT_KERNEL (user_exit()) 4) Task does stuff in kernel 5) Task switches from context tracking state CONTEXT_KERNEL to CONTEXT_USER (user_enter()) 6) Task exits the kernel If an exception fires between 5) and 6), the pt_regs and the context tracking disagree on the context of the faulted/trapped instruction. CONTEXT_KERNEL must be set before the exception handler, that's unconditional for those handlers that want to be able to call into schedule(), but CONTEXT_USER must be restored when the exception exits whereas pt_regs tells that we are resuming to kernel space. This can't be fixed with storing the context tracking state in a per-cpu or per-task variable since another exception may fire onto the current one and overwrite the saved state. Also the task can schedule. So it has to be stored in a per task stack. This is how exception_enter()/exception_exit() paper over the problem: 5) Task switches from context tracking state CONTEXT_KERNEL to CONTEXT_USER (user_enter()) 5.1) Exception fires 5.2) prev_state = exception_enter() // save CONTEXT_USER to prev_state // and set CONTEXT_KERNEL 5.3) Exception handler 5.4) exception_enter(prev_state) // restore CONTEXT_USER 5.5) Exception resumes 6) Task exits the kernel The condition to live without exception_enter()/exception_exit() is to forbid exceptions and IRQs between 2) and 3) and between 5) and 6), or if any is allowed to trigger, it won't call into context tracking, eg: NMIs, and it won't schedule. These requirements are met by architectures supporting CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK and those can therefore afford not to implement this hack. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-3-frederic@kernel.org --- include/linux/context_tracking.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d53cd331c4dd..bceb06498521 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -51,7 +51,8 @@ static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; - if (!context_tracking_enabled()) + if (IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) || + !context_tracking_enabled()) return 0; prev_ctx = this_cpu_read(context_tracking.state); @@ -63,7 +64,8 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { - if (context_tracking_enabled()) { + if (!IS_ENABLED(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) && + context_tracking_enabled()) { if (prev_ctx != CONTEXT_KERNEL) context_tracking_enter(prev_ctx); } -- cgit v1.2.3 From 9f68b5b74c48761bcbd7d90cf1426049bdbaabb7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:35 +0100 Subject: sched: Detect call to schedule from critical entry code Detect calls to schedule() between user_enter() and user_exit(). Those are symptoms of early entry code that either forgot to protect a call to schedule() inside exception_enter()/exception_exit() or, in the case of HAVE_CONTEXT_TRACKING_OFFSTACK, enabled interrupts or preemption in a wrong spot. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-4-frederic@kernel.org --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..c23d7cb5aee3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4291,6 +4291,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit v1.2.3 From 6775de4984ea83ce39f19a40c09f8813d7423831 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:36 +0100 Subject: context_tracking: Only define schedule_user() on !HAVE_CONTEXT_TRACKING_OFFSTACK archs schedule_user() was traditionally used by the entry code's tail to preempt userspace after the call to user_enter(). Indeed the call to user_enter() used to be performed upon syscall exit slow path which was right before the last opportunity to schedule() while resuming to userspace. The context tracking state had to be saved on the task stack and set back to CONTEXT_KERNEL temporarily in order to safely switch to another task. Only a few archs use it now (namely sparc64 and powerpc64) and those implementing HAVE_CONTEXT_TRACKING_OFFSTACK definetly can't rely on it. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-5-frederic@kernel.org --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c23d7cb5aee3..44426e5acde2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4631,7 +4631,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* -- cgit v1.2.3 From d1f250e2205eca9f1264f8e2d3a41fcf38f92d91 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:37 +0100 Subject: x86: Support HAVE_CONTEXT_TRACKING_OFFSTACK A lot of ground work has been performed on x86 entry code. Fragile path between user_enter() and user_exit() have IRQs disabled. Uses of RCU and intrumentation in these fragile areas have been explicitly annotated and protected. This architecture doesn't need exception_enter()/exception_exit() anymore and has therefore earned CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-6-frederic@kernel.org --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..d793361839b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -162,6 +162,7 @@ config X86 select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_CONTEXT_TRACKING if X86_64 + select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS -- cgit v1.2.3 From 5903f61e035320104394f721f74cd22171636f63 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 23 Nov 2020 10:54:58 -0500 Subject: entry: Fix boot for !CONFIG_GENERIC_ENTRY A copy-pasta mistake tries to set SYSCALL_WORK flags instead of TIF flags for !CONFIG_GENERIC_ENTRY. Also, add safeguards to catch this at compilation time. Fixes: 3136b93c3fb2 ("entry: Expose helpers to migrate TIF to SYSCALL_WORK flags") Reported-by: Naresh Kamboju Suggested-by: Jann Horn Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Link: https://lore.kernel.org/r/87a6v8qd9p.fsf_-_@collabora.com --- include/linux/thread_info.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 317363212ae9..ca80a214df09 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,7 @@ enum { GOOD_STACK, }; +#ifdef CONFIG_GENERIC_ENTRY enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, @@ -48,6 +49,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#endif #include @@ -129,11 +131,11 @@ static inline int test_ti_thread_flag(struct thread_info *ti, int flag) #else /* CONFIG_GENERIC_ENTRY */ #define set_syscall_work(fl) \ - set_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + set_ti_thread_flag(current_thread_info(), TIF_##fl) #define test_syscall_work(fl) \ - test_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + test_ti_thread_flag(current_thread_info(), TIF_##fl) #define clear_syscall_work(fl) \ - clear_ti_thread_flag(current_thread_info(), SYSCALL_WORK_##fl) + clear_ti_thread_flag(current_thread_info(), TIF_##fl) #define set_task_syscall_work(t, fl) \ set_ti_thread_flag(task_thread_info(t), TIF_##fl) -- cgit v1.2.3 From 01fe185d95ba3cdd6629859dd911a94de8800562 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 2 Dec 2020 01:34:40 +0100 Subject: MAINTAINERS: Add entry for common entry code Signed-off-by: Thomas Gleixner --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b516bb34a8d5..a35248b05fa2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7333,6 +7333,17 @@ S: Maintained F: drivers/base/arch_topology.c F: include/linux/arch_topology.h +GENERIC ENTRY CODE +M: Thomas Gleixner +M: Peter Zijlstra +M: Andy Lutomirski +L: linux-kernel@vger.kernel.org +S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git core/entry +F: include/linux/entry-common.h +F: include/linux/entry-kvm.h +F: kernel/entry/ + GENERIC GPIO I2C DRIVER M: Wolfram Sang S: Supported -- cgit v1.2.3 From c5c878125ad5aca199dfc10b1af4010165aaa596 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:32 -0500 Subject: x86: vdso: Expose sigreturn address on vdso to the kernel Syscall user redirection requires the signal trampoline code to not be captured, in order to support returning with a locked selector while avoiding recursion back into the signal handler. For ia-32, which has the trampoline in the vDSO, expose the entry points to the kernel, such that it can avoid dispatching syscalls from that region to userspace. Suggested-by: Andy Lutomirski Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201127193238.821364-2-krisman@collabora.com --- arch/x86/entry/vdso/vdso2c.c | 2 ++ arch/x86/entry/vdso/vdso32/sigreturn.S | 2 ++ arch/x86/entry/vdso/vma.c | 15 +++++++++++++++ arch/x86/include/asm/elf.h | 2 ++ arch/x86/include/asm/vdso.h | 2 ++ 5 files changed, 23 insertions(+) diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 7380908045c7..2d0f3d8bcc25 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,8 @@ struct vdso_sym required_syms[] = { {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, {"int80_landing_pad", true}, + {"vdso32_rt_sigreturn_landing_pad", true}, + {"vdso32_sigreturn_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index c3233ee98a6b..1bd068f72d4c 100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -18,6 +18,7 @@ __kernel_sigreturn: movl $__NR_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_sigreturn: +SYM_INNER_LABEL(vdso32_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_sigreturn,.-.LSTART_sigreturn @@ -29,6 +30,7 @@ __kernel_rt_sigreturn: movl $__NR_rt_sigreturn, %eax SYSCALL_ENTER_KERNEL .LEND_rt_sigreturn: +SYM_INNER_LABEL(vdso32_rt_sigreturn_landing_pad, SYM_L_GLOBAL) nop .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 50e5d3a2e70a..de60cd37070b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -436,6 +436,21 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) } #endif +bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) +{ +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) + const struct vdso_image *image = current->mm->context.vdso_image; + unsigned long vdso = (unsigned long) current->mm->context.vdso; + + if (in_ia32_syscall() && image == &vdso_image_32) { + if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || + regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) + return true; + } +#endif + return false; +} + #ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 44a9b9940535..66bdfe838d61 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -388,6 +388,8 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, compat_arch_setup_additional_pages(bprm, interpreter, \ (ex->e_machine == EM_X86_64)) +extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs); + /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bbcdc7b8f963..589f489dd375 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -27,6 +27,8 @@ struct vdso_image { long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; long sym_int80_landing_pad; + long sym_vdso32_sigreturn_landing_pad; + long sym_vdso32_rt_sigreturn_landing_pad; }; #ifdef CONFIG_X86_64 -- cgit v1.2.3 From 1d7637d89cfce54a4f4a41c2325288c2f47470e8 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:33 -0500 Subject: signal: Expose SYS_USER_DISPATCH si_code type SYS_USER_DISPATCH will be triggered when a syscall is sent to userspace by the Syscall User Dispatch mechanism. This adjusts eventual BUILD_BUG_ON around the tree. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Kees Cook Acked-by: Christian Brauner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201127193238.821364-3-krisman@collabora.com --- arch/x86/kernel/signal_compat.c | 2 +- include/uapi/asm-generic/siginfo.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..d7b51870f16b 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -31,7 +31,7 @@ static inline void signal_compat_build_tests(void) BUILD_BUG_ON(NSIGBUS != 5); BUILD_BUG_ON(NSIGTRAP != 5); BUILD_BUG_ON(NSIGCHLD != 6); - BUILD_BUG_ON(NSIGSYS != 1); + BUILD_BUG_ON(NSIGSYS != 2); /* This is part of the ABI and can never change in size: */ BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 7aacf9389010..d2597000407a 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -286,7 +286,8 @@ typedef struct siginfo { * SIGSYS si_codes */ #define SYS_SECCOMP 1 /* seccomp triggered */ -#define NSIGSYS 1 +#define SYS_USER_DISPATCH 2 /* syscall user dispatch triggered */ +#define NSIGSYS 2 /* * SIGEMT si_codes -- cgit v1.2.3 From 1446e1df9eb183fdf81c3f0715402f1d7595d4cb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:34 -0500 Subject: kernel: Implement selective syscall userspace redirection Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but who need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine. The proposed interface looks like this: prctl(PR_SET_SYSCALL_USER_DISPATCH, , , , [selector]) The range [,+) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap nor the kernel has to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn. selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. This key switch is set to either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the redirection without calling the kernel. The feature is meant to be set per-thread and it is disabled on fork/clone/execv. Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. The overhead of using the selector goes around 40ns for a native (unredirected) syscall in my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com --- fs/exec.c | 3 + include/linux/sched.h | 2 + include/linux/syscall_user_dispatch.h | 40 +++++++++++++ include/linux/thread_info.h | 2 + include/uapi/linux/prctl.h | 5 ++ kernel/entry/Makefile | 2 +- kernel/entry/common.h | 7 +++ kernel/entry/syscall_user_dispatch.c | 104 ++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sys.c | 5 ++ 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/linux/syscall_user_dispatch.h create mode 100644 kernel/entry/common.h create mode 100644 kernel/entry/syscall_user_dispatch.c diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..aee36e5733ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..5a24a033b3f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -965,6 +966,7 @@ struct task_struct { unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h new file mode 100644 index 000000000000..a0ae443fb7df --- /dev/null +++ b/include/linux/syscall_user_dispatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#ifndef _SYSCALL_USER_DISPATCH_H +#define _SYSCALL_USER_DISPATCH_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector); + +#define clear_syscall_work_syscall_user_dispatch(tsk) \ + clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH) + +#else +struct syscall_user_dispatch {}; + +static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return -EINVAL; +} + +static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_GENERIC_ENTRY */ + +#endif /* _SYSCALL_USER_DISPATCH_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ca80a214df09..c8a974cead73 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,6 +42,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, + SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -49,6 +50,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #endif #include diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..90deb41c8a34 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,9 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. + */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 02b689a23457..4a5ecb41f440 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 11894468e39def270199f845b76df6c36d4ed133 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:35 -0500 Subject: entry: Support Syscall User Dispatch on common syscall entry Syscall User Dispatch (SUD) must take precedence over seccomp and ptrace, since the use case is emulation (it can be invoked with a different ABI) such that seccomp filtering by syscall number doesn't make sense in the first place. In addition, either the syscall is dispatched back to userspace, in which case there is no resource for to trace, or the syscall will be executed, and seccomp/ptrace will execute next. Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as well, just to prevent a trace exit event when dispatch was triggered. For that, the on_syscall_dispatch() examines context to skip the tracepoint, audit and other work. [ tglx: Add a comment on the exit side ] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-5-krisman@collabora.com --- include/linux/entry-common.h | 2 ++ kernel/entry/common.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 49b26b216e4e..a6e98b4ba8e9 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -44,10 +44,12 @@ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_EXIT) /* diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 91e8fd50adf4..e661e70ffcf3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,8 @@ #include #include +#include "common.h" + #define CREATE_TRACE_POINTS #include @@ -46,6 +48,16 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, { long ret = 0; + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); @@ -230,6 +242,19 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + audit_syscall_exit(regs); if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) -- cgit v1.2.3 From 179ef035992e89646e17138b18b130bb874b86bb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:36 -0500 Subject: selftests: Add kselftest for syscall user dispatch Implement functionality tests for syscall user dispatch. In order to make the test portable, refrain from open coding syscall dispatchers and calculating glibc memory ranges. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Kees Cook Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201127193238.821364-6-krisman@collabora.com --- tools/testing/selftests/Makefile | 1 + .../selftests/syscall_user_dispatch/.gitignore | 3 + .../selftests/syscall_user_dispatch/Makefile | 9 + .../testing/selftests/syscall_user_dispatch/config | 1 + .../selftests/syscall_user_dispatch/sud_test.c | 310 +++++++++++++++++++++ 5 files changed, 324 insertions(+) create mode 100644 tools/testing/selftests/syscall_user_dispatch/.gitignore create mode 100644 tools/testing/selftests/syscall_user_dispatch/Makefile create mode 100644 tools/testing/selftests/syscall_user_dispatch/config create mode 100644 tools/testing/selftests/syscall_user_dispatch/sud_test.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index d9c283503159..96d5682fb9d8 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -56,6 +56,7 @@ TARGETS += sparc64 TARGETS += splice TARGETS += static_keys TARGETS += sync +TARGETS += syscall_user_dispatch TARGETS += sysctl TARGETS += tc-testing TARGETS += timens diff --git a/tools/testing/selftests/syscall_user_dispatch/.gitignore b/tools/testing/selftests/syscall_user_dispatch/.gitignore new file mode 100644 index 000000000000..f539615ad5da --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/.gitignore @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +sud_test +sud_benchmark diff --git a/tools/testing/selftests/syscall_user_dispatch/Makefile b/tools/testing/selftests/syscall_user_dispatch/Makefile new file mode 100644 index 000000000000..8e15fa42bcda --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/Makefile @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: GPL-2.0 +top_srcdir = ../../../.. +INSTALL_HDR_PATH = $(top_srcdir)/usr +LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ + +CFLAGS += -Wall -I$(LINUX_HDR_PATH) + +TEST_GEN_PROGS := sud_test +include ../lib.mk diff --git a/tools/testing/selftests/syscall_user_dispatch/config b/tools/testing/selftests/syscall_user_dispatch/config new file mode 100644 index 000000000000..039e303e59d7 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/config @@ -0,0 +1 @@ +CONFIG_GENERIC_ENTRY=y diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c new file mode 100644 index 000000000000..6498b050ef89 --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Test code for syscall user dispatch + */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include +#include "../kselftest_harness.h" + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifndef SYS_USER_DISPATCH +# define SYS_USER_DISPATCH 2 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +#define SYSCALL_DISPATCH_ON(x) ((x) = 1) +#define SYSCALL_DISPATCH_OFF(x) ((x) = 0) + +/* Test Summary: + * + * - dispatch_trigger_sigsys: Verify if PR_SET_SYSCALL_USER_DISPATCH is + * able to trigger SIGSYS on a syscall. + * + * - bad_selector: Test that a bad selector value triggers SIGSYS with + * si_errno EINVAL. + * + * - bad_prctl_param: Test that the API correctly rejects invalid + * parameters on prctl + * + * - dispatch_and_return: Test that a syscall is selectively dispatched + * to userspace depending on the value of selector. + * + * - disable_dispatch: Test that the PR_SYS_DISPATCH_OFF correctly + * disables the dispatcher + * + * - direct_dispatch_range: Test that a syscall within the allowed range + * can bypass the dispatcher. + */ + +TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) +{ + char sel = 0; + struct sysinfo info; + int ret; + + ret = sysinfo(&info); + ASSERT_EQ(0, ret); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + sysinfo(&info); + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(bad_prctl_param) +{ + char sel = 0; + int op; + + /* Invalid op */ + op = -1; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0, 0, &sel); + ASSERT_EQ(EINVAL, errno); + + /* PR_SYS_DISPATCH_OFF */ + op = PR_SYS_DISPATCH_OFF; + + /* offset != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, 0); + EXPECT_EQ(EINVAL, errno); + + /* len != 0 */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0xff, 0); + EXPECT_EQ(EINVAL, errno); + + /* sel != NULL */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Valid parameter */ + errno = 0; + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x0, 0x0); + EXPECT_EQ(0, errno); + + /* PR_SYS_DISPATCH_ON */ + op = PR_SYS_DISPATCH_ON; + + /* Dispatcher region is bad (offset > 0 && len == 0) */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x1, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, -1L, 0x0, &sel); + EXPECT_EQ(EINVAL, errno); + + /* Invalid selector */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, op, 0x0, 0x1, (void *) -1); + ASSERT_EQ(EFAULT, errno); + + /* + * Dispatcher range overflows unsigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 1, -1L, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } + + /* + * Allowed range overflows usigned long + */ + prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, -1L, 0x1, &sel); + ASSERT_EQ(EINVAL, errno) { + TH_LOG("Should reject bad syscall range"); + } +} + +/* + * Use global selector for handle_sigsys tests, to avoid passing + * selector to signal handler + */ +char glob_sel; +int nr_syscalls_emulated; +int si_code; +int si_errno; + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + si_code = info->si_code; + si_errno = info->si_errno; + + if (info->si_syscall == MAGIC_SYSCALL_1) + nr_syscalls_emulated++; + + /* In preparation for sigreturn. */ + SYSCALL_DISPATCH_OFF(glob_sel); +} + +TEST(dispatch_and_return) +{ + long ret; + struct sigaction act; + sigset_t mask; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(-1, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } + + /* MAGIC_SYSCALL_1 should be emulated. */ + nr_syscalls_emulated = 0; + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(MAGIC_SYSCALL_1); + EXPECT_EQ(MAGIC_SYSCALL_1, ret) { + TH_LOG("Failed to intercept syscall"); + } + EXPECT_EQ(1, nr_syscalls_emulated) { + TH_LOG("Failed to emulate syscall"); + } + ASSERT_EQ(SYS_USER_DISPATCH, si_code) { + TH_LOG("Bad si_code in SIGSYS"); + } + ASSERT_EQ(0, si_errno) { + TH_LOG("Bad si_errno in SIGSYS"); + } +} + +TEST_SIGNAL(bad_selector, SIGSYS) +{ + long ret; + struct sigaction act; + sigset_t mask; + struct sysinfo info; + + glob_sel = 0; + nr_syscalls_emulated = 0; + si_code = 0; + si_errno = 0; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + ret = sigaction(SIGSYS, &act, NULL); + ASSERT_EQ(0, ret); + + /* Make sure selector is good prior to prctl. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &glob_sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + glob_sel = -1; + + sysinfo(&info); + + /* Even though it is ready to catch SIGSYS, the signal is + * supposed to be uncatchable. + */ + + EXPECT_FALSE(true) { + TH_LOG("Unreachable!"); + } +} + +TEST(disable_dispatch) +{ + int ret; + struct sysinfo info; + char sel = 0; + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, 0, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + /* MAGIC_SYSCALL_1 doesn't exist. */ + SYSCALL_DISPATCH_OFF(glob_sel); + + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_OFF, 0, 0, 0); + EXPECT_EQ(0, ret) { + TH_LOG("Failed to unset syscall user dispatch"); + } + + /* Shouldn't have any effect... */ + SYSCALL_DISPATCH_ON(glob_sel); + + ret = syscall(__NR_sysinfo, &info); + EXPECT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST(direct_dispatch_range) +{ + int ret = 0; + struct sysinfo info; + char sel = 0; + + /* + * Instead of calculating libc addresses; allow the entire + * memory map and lock the selector. + */ + ret = prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, 0, -1L, &sel); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support CONFIG_SYSCALL_USER_DISPATCH"); + } + + SYSCALL_DISPATCH_ON(sel); + + ret = sysinfo(&info); + ASSERT_EQ(0, ret) { + TH_LOG("Dispatch triggered unexpectedly"); + } +} + +TEST_HARNESS_MAIN -- cgit v1.2.3 From d87ae0fa21c26db2d7c66f22dee9c27ecda48ce2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:37 -0500 Subject: selftests: Add benchmark for syscall user dispatch This is the patch I'm using to evaluate the impact syscall user dispatch has on native syscall (syscalls not redirected to userspace) when enabled for the process and submiting syscalls though the unblocked dispatch selector. It works by running a step to define a baseline of the cost of executing sysinfo, then enabling SUD, and rerunning that step. On my test machine, an AMD Ryzen 5 1500X, I have the following results with the latest version of syscall user dispatch patches. root@olga:~# syscall_user_dispatch/sud_benchmark Calibrating test set to last ~5 seconds... test iterations = 37500000 Avg syscall time 134ns. Caught sys_ff00 trapped_call_count 1, native_call_count 0. Avg syscall time 147ns. Interception overhead: 9.7% (+13ns). Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201127193238.821364-7-krisman@collabora.com --- .../selftests/syscall_user_dispatch/Makefile | 2 +- .../syscall_user_dispatch/sud_benchmark.c | 200 +++++++++++++++++++++ 2 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c diff --git a/tools/testing/selftests/syscall_user_dispatch/Makefile b/tools/testing/selftests/syscall_user_dispatch/Makefile index 8e15fa42bcda..03c120270953 100644 --- a/tools/testing/selftests/syscall_user_dispatch/Makefile +++ b/tools/testing/selftests/syscall_user_dispatch/Makefile @@ -5,5 +5,5 @@ LINUX_HDR_PATH = $(INSTALL_HDR_PATH)/include/ CFLAGS += -Wall -I$(LINUX_HDR_PATH) -TEST_GEN_PROGS := sud_test +TEST_GEN_PROGS := sud_test sud_benchmark include ../lib.mk diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c new file mode 100644 index 000000000000..6689f1183dbf --- /dev/null +++ b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2020 Collabora Ltd. + * + * Benchmark and test syscall user dispatch + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef PR_SET_SYSCALL_USER_DISPATCH +# define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 +#endif + +#ifdef __NR_syscalls +# define MAGIC_SYSCALL_1 (__NR_syscalls + 1) /* Bad Linux syscall number */ +#else +# define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ +#endif + +/* + * To test returning from a sigsys with selector blocked, the test + * requires some per-architecture support (i.e. knowledge about the + * signal trampoline address). On i386, we know it is on the vdso, and + * a small trampoline is open-coded for x86_64. Other architectures + * that have a trampoline in the vdso will support TEST_BLOCKED_RETURN + * out of the box, but don't enable them until they support syscall user + * dispatch. + */ +#if defined(__x86_64__) || defined(__i386__) +#define TEST_BLOCKED_RETURN +#endif + +#ifdef __x86_64__ +void* (syscall_dispatcher_start)(void); +void* (syscall_dispatcher_end)(void); +#else +unsigned long syscall_dispatcher_start = 0; +unsigned long syscall_dispatcher_end = 0; +#endif + +unsigned long trapped_call_count = 0; +unsigned long native_call_count = 0; + +char selector; +#define SYSCALL_BLOCK (selector = PR_SYS_DISPATCH_ON) +#define SYSCALL_UNBLOCK (selector = PR_SYS_DISPATCH_OFF) + +#define CALIBRATION_STEP 100000 +#define CALIBRATE_TO_SECS 5 +int factor; + +static double one_sysinfo_step(void) +{ + struct timespec t1, t2; + int i; + struct sysinfo info; + + clock_gettime(CLOCK_MONOTONIC, &t1); + for (i = 0; i < CALIBRATION_STEP; i++) + sysinfo(&info); + clock_gettime(CLOCK_MONOTONIC, &t2); + return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec); +} + +static void calibrate_set(void) +{ + double elapsed = 0; + + printf("Calibrating test set to last ~%d seconds...\n", CALIBRATE_TO_SECS); + + while (elapsed < 1) { + elapsed += one_sysinfo_step(); + factor += CALIBRATE_TO_SECS; + } + + printf("test iterations = %d\n", CALIBRATION_STEP * factor); +} + +static double perf_syscall(void) +{ + unsigned int i; + double partial = 0; + + for (i = 0; i < factor; ++i) + partial += one_sysinfo_step()/(CALIBRATION_STEP*factor); + return partial; +} + +static void handle_sigsys(int sig, siginfo_t *info, void *ucontext) +{ + char buf[1024]; + int len; + + SYSCALL_UNBLOCK; + + /* printf and friends are not signal-safe. */ + len = snprintf(buf, 1024, "Caught sys_%x\n", info->si_syscall); + write(1, buf, len); + + if (info->si_syscall == MAGIC_SYSCALL_1) + trapped_call_count++; + else + native_call_count++; + +#ifdef TEST_BLOCKED_RETURN + SYSCALL_BLOCK; +#endif + +#ifdef __x86_64__ + __asm__ volatile("movq $0xf, %rax"); + __asm__ volatile("leaveq"); + __asm__ volatile("add $0x8, %rsp"); + __asm__ volatile("syscall_dispatcher_start:"); + __asm__ volatile("syscall"); + __asm__ volatile("nop"); /* Landing pad within dispatcher area */ + __asm__ volatile("syscall_dispatcher_end:"); +#endif + +} + +int main(void) +{ + struct sigaction act; + double time1, time2; + int ret; + sigset_t mask; + + memset(&act, 0, sizeof(act)); + sigemptyset(&mask); + + act.sa_sigaction = handle_sigsys; + act.sa_flags = SA_SIGINFO; + act.sa_mask = mask; + + calibrate_set(); + + time1 = perf_syscall(); + printf("Avg syscall time %.0lfns.\n", time1 * 1.0e9); + + ret = sigaction(SIGSYS, &act, NULL); + if (ret) { + perror("Error sigaction:"); + exit(-1); + } + + fprintf(stderr, "Enabling syscall trapping.\n"); + + if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON, + syscall_dispatcher_start, + (syscall_dispatcher_end - syscall_dispatcher_start + 1), + &selector)) { + perror("prctl failed\n"); + exit(-1); + } + + SYSCALL_BLOCK; + syscall(MAGIC_SYSCALL_1); + +#ifdef TEST_BLOCKED_RETURN + if (selector == PR_SYS_DISPATCH_OFF) { + fprintf(stderr, "Failed to return with selector blocked.\n"); + exit(-1); + } +#endif + + SYSCALL_UNBLOCK; + + if (!trapped_call_count) { + fprintf(stderr, "syscall trapping does not work.\n"); + exit(-1); + } + + time2 = perf_syscall(); + + if (native_call_count) { + perror("syscall trapping intercepted more syscalls than expected\n"); + exit(-1); + } + + printf("trapped_call_count %lu, native_call_count %lu.\n", + trapped_call_count, native_call_count); + printf("Avg syscall time %.0lfns.\n", time2 * 1.0e9); + printf("Interception overhead: %.1lf%% (+%.0lfns).\n", + 100.0 * (time2 / time1 - 1.0), 1.0e9 * (time2 - time1)); + return 0; + +} -- cgit v1.2.3 From a4452e671c6770e1bb80764f39995934067f70a0 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:38 -0500 Subject: docs: Document Syscall User Dispatch Explain the interface, provide some background and security notes. [ tglx: Add note about non-visibility, add it to the index and fix the kerneldoc warning ] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Kees Cook Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201127193238.821364-8-krisman@collabora.com --- Documentation/admin-guide/index.rst | 1 + .../admin-guide/syscall-user-dispatch.rst | 90 ++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 Documentation/admin-guide/syscall-user-dispatch.rst diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 4e0c4ae44acd..b29d3c127a95 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -111,6 +111,7 @@ configure specific aspects of kernel behavior to your liking. rtc serial-console svga + syscall-user-dispatch sysrq thunderbolt ufs diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst new file mode 100644 index 000000000000..a380d6515774 --- /dev/null +++ b/Documentation/admin-guide/syscall-user-dispatch.rst @@ -0,0 +1,90 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================== +Syscall User Dispatch +===================== + +Background +---------- + +Compatibility layers like Wine need a way to efficiently emulate system +calls of only a part of their process - the part that has the +incompatible code - while being able to execute native syscalls without +a high performance penalty on the native part of the process. Seccomp +falls short on this task, since it has limited support to efficiently +filter syscalls based on memory regions, and it doesn't support removing +filters. Therefore a new mechanism is necessary. + +Syscall User Dispatch brings the filtering of the syscall dispatcher +address back to userspace. The application is in control of a flip +switch, indicating the current personality of the process. A +multiple-personality application can then flip the switch without +invoking the kernel, when crossing the compatibility layer API +boundaries, to enable/disable the syscall redirection and execute +syscalls directly (disabled) or send them to be emulated in userspace +through a SIGSYS. + +The goal of this design is to provide very quick compatibility layer +boundary crosses, which is achieved by not executing a syscall to change +personality every time the compatibility layer executes. Instead, a +userspace memory region exposed to the kernel indicates the current +personality, and the application simply modifies that variable to +configure the mechanism. + +There is a relatively high cost associated with handling signals on most +architectures, like x86, but at least for Wine, syscalls issued by +native Windows code are currently not known to be a performance problem, +since they are quite rare, at least for modern gaming applications. + +Since this mechanism is designed to capture syscalls issued by +non-native applications, it must function on syscalls whose invocation +ABI is completely unexpected to Linux. Syscall User Dispatch, therefore +doesn't rely on any of the syscall ABI to make the filtering. It uses +only the syscall dispatcher address and the userspace key. + +As the ABI of these intercepted syscalls is unknown to Linux, these +syscalls are not instrumentable via ptrace or the syscall tracepoints. + +Interface +--------- + +A thread can setup this mechanism on supported kernels by executing the +following prctl: + + prctl(PR_SET_SYSCALL_USER_DISPATCH, , , , [selector]) + + is either PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF, to enable and +disable the mechanism globally for that thread. When +PR_SYS_DISPATCH_OFF is used, the other fields must be zero. + +[, +) delimit a memory region interval +from which syscalls are always executed directly, regardless of the +userspace selector. This provides a fast path for the C library, which +includes the most common syscall dispatchers in the native code +applications, and also provides a way for the signal handler to return +without triggering a nested SIGSYS on (rt\_)sigreturn. Users of this +interface should make sure that at least the signal trampoline code is +included in this region. In addition, for syscalls that implement the +trampoline code on the vDSO, that trampoline is never intercepted. + +[selector] is a pointer to a char-sized region in the process memory +region, that provides a quick way to enable disable syscall redirection +thread-wide, without the need to invoke the kernel directly. selector +can be set to PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF. Any other +value should terminate the program with a SIGSYS. + +Security Notes +-------------- + +Syscall User Dispatch provides functionality for compatibility layers to +quickly capture system calls issued by a non-native part of the +application, while not impacting the Linux native regions of the +process. It is not a mechanism for sandboxing system calls, and it +should not be seen as a security mechanism, since it is trivial for a +malicious application to subvert the mechanism by jumping to an allowed +dispatcher region prior to executing the syscall, or to discover the +address and modify the selector value. If the use case requires any +kind of security sandboxing, Seccomp should be used instead. + +Any fork or exec of the existing process resets the mechanism to +PR_SYS_DISPATCH_OFF. -- cgit v1.2.3 From 6666bb714fb3bc7b2e8be72b9c92f2d8a89ea2dc Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:51 +0100 Subject: entry: Rename enter_from_user_mode() In order to make this function publicly available rename it so it can still be inlined. An additional enter_from_user_mode() function will be added with a later commit. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-2-svens@linux.ibm.com --- kernel/entry/common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e661e70ffcf3..8e294a794e93 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,7 +11,7 @@ #include /** - * enter_from_user_mode - Establish state when coming from user mode + * __enter_from_user_mode - Establish state when coming from user mode * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -20,7 +20,7 @@ * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state */ -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); lockdep_hardirqs_off(CALLER_ADDR0); @@ -103,7 +103,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) { long ret; - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -115,7 +115,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); @@ -304,7 +304,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From bb793562f0da7317adf6c456316bca651ff46f5d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:52 +0100 Subject: entry: Rename exit_to_user_mode() In order to make this function publicly available rename it so it can still be inlined. An additional exit_to_user_mode() function will be added with a later commit. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-3-svens@linux.ibm.com --- kernel/entry/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8e294a794e93..dff07b4ce6ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -122,7 +122,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) } /** - * exit_to_user_mode - Fixup state when exiting to user mode + * __exit_to_user_mode - Fixup state when exiting to user mode * * Syscall/interupt exit enables interrupts, but the kernel state is * interrupts disabled when this is invoked. Also tell RCU about it. @@ -133,7 +133,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) * mitigations, etc. * 4) Tell lockdep that interrupts are enabled */ -static __always_inline void exit_to_user_mode(void) +static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); @@ -299,7 +299,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) @@ -312,7 +312,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) -- cgit v1.2.3 From 96e2fbccd0fc806364a964fdf072bfc858a66109 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:53 +0100 Subject: entry_Add_enter_from_user_mode_wrapper To be called from architecture specific code if the combo interfaces are not suitable. It simply calls __enter_from_user_mode(). This way __enter_from_user_mode will still be inlined because it is declared static __always_inline. [ tglx: Amend comments and move it to a different location in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-4-svens@linux.ibm.com --- include/linux/entry-common.h | 24 +++++++++++++++++++++++- kernel/entry/common.c | 16 ++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index a6e98b4ba8e9..da60980a2e7b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -101,6 +101,27 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs } #endif +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific syscall entry code with interrupts + * disabled. The calling code has to be non-instrumentable. When the + * function returns all state is correct and interrupts are still + * disabled. The subsequent functions can be instrumented. + * + * This is invoked when there is architecture specific functionality to be + * done between establishing state and enabling interrupts. The caller must + * enable interrupts before invoking syscall_enter_from_user_mode_work(). + */ +void enter_from_user_mode(struct pt_regs *regs); + /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts * @regs: Pointer to currents pt_regs @@ -110,7 +131,8 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This handles lockdep, RCU (context tracking) and tracing state. + * This handles lockdep, RCU (context tracking) and tracing state, i.e. + * the functionality provided by enter_from_user_mode(). * * This is invoked when there is extra architecture specific functionality * to be done between establishing state and handling user mode entry work. diff --git a/kernel/entry/common.c b/kernel/entry/common.c index dff07b4ce6ec..17b1e032afe7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -10,16 +10,7 @@ #define CREATE_TRACE_POINTS #include -/** - * __enter_from_user_mode - Establish state when coming from user mode - * - * Syscall/interrupt entry disables interrupts, but user mode is traced as - * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. - * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ +/* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); @@ -33,6 +24,11 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } +void noinstr enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(regs); +} + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { -- cgit v1.2.3 From 310de1a678b2184c078c593dae343cb79c807f8d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:54 +0100 Subject: entry: Add exit_to_user_mode() wrapper Called from architecture specific code when syscall_exit_to_user_mode() is not suitable. It simply calls __exit_to_user_mode(). This way __exit_to_user_mode() can still be inlined because it is declared static __always_inline. [ tglx: Amended comments and moved it to a different place in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-5-svens@linux.ibm.com --- include/linux/entry-common.h | 23 +++++++++++++++++++++-- kernel/entry/common.c | 18 ++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index da60980a2e7b..e370be8121aa 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -300,6 +300,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) } #endif +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interrupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc.: arch_exit_to_user_mode() + * 4) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code when syscall_exit_to_user_mode() + * is not suitable as the last step before returning to userspace. Must be + * invoked with interrupts disabled and the caller must be + * non-instrumentable. + */ +void exit_to_user_mode(void); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -322,8 +341,8 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * - Architecture specific one time work arch_exit_to_user_mode_prepare() * - Address limit and lockdep checks * - * 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes - * arch_exit_to_user_mode() to handle e.g. speculation mitigations + * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the + * functionality in exit_to_user_mode(). */ void syscall_exit_to_user_mode(struct pt_regs *regs); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 17b1e032afe7..48d30ce2e00e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -117,18 +117,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) instrumentation_end(); } -/** - * __exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall/interupt exit enables interrupts, but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Invoke architecture specific last minute exit code, e.g. speculation - * mitigations, etc. - * 4) Tell lockdep that interrupts are enabled - */ +/* See comment for exit_to_user_mode() in entry-common.h */ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); @@ -141,6 +130,11 @@ static __always_inline void __exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +void noinstr exit_to_user_mode(void) +{ + __exit_to_user_mode(); +} + /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } -- cgit v1.2.3 From c6156e1da633f241e132eaea3b676d674376d770 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:55 +0100 Subject: entry: Add syscall_exit_to_user_mode_work() This is the same as syscall_exit_to_user_mode() but without calling exit_to_user_mode(). This can be used if there is an architectural reason to avoid the combo function, e.g. restarting a syscall without returning to userspace. Before returning to user space the caller has to invoke exit_to_user_mode(). [ tglx: Amended comments ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-6-svens@linux.ibm.com --- include/linux/entry-common.h | 20 ++++++++++++++++++++ kernel/entry/common.c | 14 ++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index e370be8121aa..7c581a4c3797 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -316,9 +316,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * is not suitable as the last step before returning to userspace. Must be * invoked with interrupts disabled and the caller must be * non-instrumentable. + * The caller has to invoke syscall_exit_to_user_mode_work() before this. */ void exit_to_user_mode(void); +/** + * syscall_exit_to_user_mode_work - Handle work before returning to user mode + * @regs: Pointer to currents pt_regs + * + * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling + * exit_to_user_mode() to perform the final transition to user mode. + * + * Calling convention is the same as for syscall_exit_to_user_mode() and it + * returns with all work handled and interrupts disabled. The caller must + * invoke exit_to_user_mode() before actually switching to user mode to + * make the final state transitions. Interrupts must stay disabled between + * return from this function and the invocation of exit_to_user_mode(). + */ +void syscall_exit_to_user_mode_work(struct pt_regs *regs); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -343,6 +359,10 @@ void exit_to_user_mode(void); * * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the * functionality in exit_to_user_mode(). + * + * This is a combination of syscall_exit_to_user_mode_work() (1,2) and + * exit_to_user_mode(). This function is preferred unless there is a + * compelling architectural reason to use the seperate functions. */ void syscall_exit_to_user_mode(struct pt_regs *regs); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 48d30ce2e00e..d6b73937dab3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -282,12 +282,22 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) syscall_exit_work(regs, work); } -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) { - instrumentation_begin(); syscall_exit_to_user_mode_prepare(regs); local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); +} + +void syscall_exit_to_user_mode_work(struct pt_regs *regs) +{ + __syscall_exit_to_user_mode_work(regs); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + __syscall_exit_to_user_mode_work(regs); instrumentation_end(); __exit_to_user_mode(); } -- cgit v1.2.3