From 8b1bc88c3c67bf95de248c35ed2855e0f4a4db90 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 10 Jul 2020 09:24:36 -0700 Subject: selftests/seccomp: Rename XFAIL to SKIP The kselftests will be renaming XFAIL to SKIP in the test harness, and to avoid painful conflicts, rename XFAIL to SKIP now in a future-proofed way. Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index c0aa46ce14f6..d4c8858fde8e 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -52,6 +52,11 @@ #include "../kselftest_harness.h" +/* Attempt to de-conflict with the selftests tree. */ +#ifndef SKIP +#define SKIP(s, ...) XFAIL(s, ##__VA_ARGS__) +#endif + #ifndef PR_SET_PTRACER # define PR_SET_PTRACER 0x59616d61 #endif @@ -3068,7 +3073,7 @@ TEST(get_metadata) /* Only real root can get metadata. */ if (geteuid()) { - XFAIL(return, "get_metadata requires real root"); + SKIP(return, "get_metadata requires real root"); return; } @@ -3111,7 +3116,7 @@ TEST(get_metadata) ret = ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md); EXPECT_EQ(sizeof(md), ret) { if (errno == EINVAL) - XFAIL(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); + SKIP(goto skip, "Kernel does not support PTRACE_SECCOMP_GET_METADATA (missing CONFIG_CHECKPOINT_RESTORE?)"); } EXPECT_EQ(md.flags, SECCOMP_FILTER_FLAG_LOG); @@ -3672,7 +3677,7 @@ TEST(user_notification_continue) resp.val = 0; EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0) { if (errno == EINVAL) - XFAIL(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE"); + SKIP(goto skip, "Kernel does not support SECCOMP_USER_NOTIF_FLAG_CONTINUE"); } skip: @@ -3680,7 +3685,7 @@ skip: EXPECT_EQ(true, WIFEXITED(status)); EXPECT_EQ(0, WEXITSTATUS(status)) { if (WEXITSTATUS(status) == 2) { - XFAIL(return, "Kernel does not support kcmp() syscall"); + SKIP(return, "Kernel does not support kcmp() syscall"); return; } } -- cgit v1.2.3 From d7d2e5bb9f361dc05c7dedd38d4d9269ad8c05e2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 10 Jul 2020 10:26:11 -0700 Subject: selftests/seccomp: Add SKIPs for failed unshare() Running the seccomp tests as a regular user shouldn't just fail tests that require CAP_SYS_ADMIN (for getting a PID namespace). Instead, detect those cases and SKIP them. Additionally, gracefully SKIP missing CONFIG_USER_NS (and add to "config" since we'd prefer to actually test this case). Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/config | 1 + tools/testing/selftests/seccomp/seccomp_bpf.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/seccomp/config b/tools/testing/selftests/seccomp/config index db1e11b08c8a..64c19d8eba79 100644 --- a/tools/testing/selftests/seccomp/config +++ b/tools/testing/selftests/seccomp/config @@ -1,2 +1,3 @@ CONFIG_SECCOMP=y CONFIG_SECCOMP_FILTER=y +CONFIG_USER_NS=y diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index d4c8858fde8e..80223113f3e3 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3444,7 +3444,10 @@ TEST(user_notification_child_pid_ns) struct seccomp_notif req = {}; struct seccomp_notif_resp resp = {}; - ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0); + ASSERT_EQ(unshare(CLONE_NEWUSER | CLONE_NEWPID), 0) { + if (errno == EINVAL) + SKIP(return, "kernel missing CLONE_NEWUSER support"); + }; listener = user_trap_syscall(__NR_getppid, SECCOMP_FILTER_FLAG_NEW_LISTENER); @@ -3509,7 +3512,10 @@ TEST(user_notification_sibling_pid_ns) } /* Create the sibling ns, and sibling in it. */ - ASSERT_EQ(unshare(CLONE_NEWPID), 0); + ASSERT_EQ(unshare(CLONE_NEWPID), 0) { + if (errno == EPERM) + SKIP(return, "CLONE_NEWPID requires CAP_SYS_ADMIN"); + } ASSERT_EQ(errno, 0); pid2 = fork(); -- cgit v1.2.3 From e4d05028a07f505a08802a6d1b11674c149df2b3 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 10 Jul 2020 10:29:41 -0700 Subject: selftests/seccomp: Set NNP for TSYNC ESRCH flag test The TSYNC ESRCH flag test will fail for regular users because NNP was not set yet. Add NNP setting. Fixes: 51891498f2da ("seccomp: allow TSYNC and USER_NOTIF together") Cc: stable@vger.kernel.org Reviewed-by: Tycho Andersen Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 80223113f3e3..c8f93edc52cc 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3262,6 +3262,11 @@ TEST(user_notification_with_tsync) int ret; unsigned int flags; + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + /* these were exclusive */ flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | SECCOMP_FILTER_FLAG_TSYNC; -- cgit v1.2.3 From c818c03b661cd769e035e41673d5543ba2ebda64 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 13 May 2020 14:11:26 -0700 Subject: seccomp: Report number of loaded filters in /proc/$pid/status A common question asked when debugging seccomp filters is "how many filters are attached to your process?" Provide a way to easily answer this question through /proc/$pid/status with a "Seccomp_filters" line. Signed-off-by: Kees Cook --- fs/proc/array.c | 2 ++ include/linux/seccomp.h | 2 ++ init/init_task.c | 3 +++ kernel/seccomp.c | 3 +++ 4 files changed, 10 insertions(+) diff --git a/fs/proc/array.c b/fs/proc/array.c index 55ecbeb3a721..65ec2029fa80 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -341,6 +341,8 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p) seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); + seq_put_decimal_ull(m, "\nSeccomp_filters:\t", + atomic_read(&p->seccomp.filter_count)); #endif seq_puts(m, "\nSpeculation_Store_Bypass:\t"); switch (arch_prctl_spec_ctrl_get(p, PR_SPEC_STORE_BYPASS)) { diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 4192369b8418..2ec2720f83cc 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -13,6 +13,7 @@ #ifdef CONFIG_SECCOMP #include +#include #include struct seccomp_filter; @@ -29,6 +30,7 @@ struct seccomp_filter; */ struct seccomp { int mode; + atomic_t filter_count; struct seccomp_filter *filter; }; diff --git a/init/init_task.c b/init/init_task.c index 15089d15010a..a3eb3847e1f4 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -204,6 +204,9 @@ struct task_struct init_task #ifdef CONFIG_SECURITY .security = NULL, #endif +#ifdef CONFIG_SECCOMP + .seccomp = { .filter_count = ATOMIC_INIT(0) }, +#endif }; EXPORT_SYMBOL(init_task); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d653d8426de9..f387e5004c29 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -398,6 +398,8 @@ static inline void seccomp_sync_threads(unsigned long flags) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + atomic_set(&thread->seccomp.filter_count, + atomic_read(&thread->seccomp.filter_count)); /* * Don't let an unprivileged task work around @@ -544,6 +546,7 @@ static long seccomp_attach_filter(unsigned int flags, */ filter->prev = current->seccomp.filter; current->seccomp.filter = filter; + atomic_inc(¤t->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) -- cgit v1.2.3 From 9f87dcf14b82b05ff0e26970439b372ae135de0c Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Mon, 1 Jun 2020 04:25:32 -0700 Subject: seccomp: Add find_notification helper This adds a helper which can iterate through a seccomp_filter to find a notification matching an ID. It removes several replicated chunks of code. Signed-off-by: Sargun Dhillon Acked-by: Christian Brauner Reviewed-by: Tycho Andersen Cc: Matt Denton Cc: Kees Cook , Cc: Jann Horn , Cc: Robert Sesek , Cc: Chris Palmer Cc: Christian Brauner Cc: Tycho Andersen Link: https://lore.kernel.org/r/20200601112532.150158-1-sargun@sargun.me Signed-off-by: Kees Cook --- kernel/seccomp.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index f387e5004c29..e711d697afb6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -41,6 +41,7 @@ #include #include #include +#include enum notify_state { SECCOMP_NOTIFY_INIT, @@ -1024,6 +1025,23 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) return 0; } +/* must be called with notif_lock held */ +static inline struct seccomp_knotif * +find_notification(struct seccomp_filter *filter, u64 id) +{ + struct seccomp_knotif *cur; + + lockdep_assert_held(&filter->notify_lock); + + list_for_each_entry(cur, &filter->notif->notifications, list) { + if (cur->id == id) + return cur; + } + + return NULL; +} + + static long seccomp_notify_recv(struct seccomp_filter *filter, void __user *buf) { @@ -1081,15 +1099,8 @@ out: * may have died when we released the lock, so we need to make * sure it's still around. */ - knotif = NULL; mutex_lock(&filter->notify_lock); - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == unotif.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, unotif.id); if (knotif) { knotif->state = SECCOMP_NOTIFY_INIT; up(&filter->notif->request); @@ -1104,7 +1115,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, void __user *buf) { struct seccomp_notif_resp resp = {}; - struct seccomp_knotif *knotif = NULL, *cur; + struct seccomp_knotif *knotif; long ret; if (copy_from_user(&resp, buf, sizeof(resp))) @@ -1121,13 +1132,7 @@ static long seccomp_notify_send(struct seccomp_filter *filter, if (ret < 0) return ret; - list_for_each_entry(cur, &filter->notif->notifications, list) { - if (cur->id == resp.id) { - knotif = cur; - break; - } - } - + knotif = find_notification(filter, resp.id); if (!knotif) { ret = -ENOENT; goto out; @@ -1153,7 +1158,7 @@ out: static long seccomp_notify_id_valid(struct seccomp_filter *filter, void __user *buf) { - struct seccomp_knotif *knotif = NULL; + struct seccomp_knotif *knotif; u64 id; long ret; @@ -1164,16 +1169,12 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, if (ret < 0) return ret; - ret = -ENOENT; - list_for_each_entry(knotif, &filter->notif->notifications, list) { - if (knotif->id == id) { - if (knotif->state == SECCOMP_NOTIFY_SENT) - ret = 0; - goto out; - } - } + knotif = find_notification(filter, id); + if (knotif && knotif->state == SECCOMP_NOTIFY_SENT) + ret = 0; + else + ret = -ENOENT; -out: mutex_unlock(&filter->notify_lock); return ret; } -- cgit v1.2.3 From b707ddee11d1dc4518ab7f1aa5e7af9ceaa23317 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:28 +0200 Subject: seccomp: rename "usage" to "refs" and document Naming the lifetime counter of a seccomp filter "usage" suggests a little too strongly that its about tasks that are using this filter while it also tracks other references such as the user notifier or ptrace. This also updates the documentation to note this fact. We'll be introducing an actual usage counter in a follow-up patch. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-1-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index e711d697afb6..d4dd3344e312 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -107,10 +107,11 @@ struct notification { /** * struct seccomp_filter - container for seccomp BPF programs * - * @usage: reference count to manage the object lifetime. - * get/put helpers should be used when accessing an instance - * outside of a lifetime-guarded section. In general, this - * is only needed for handling filters shared across tasks. + * @refs: Reference count to manage the object lifetime. + * A filter's reference count is incremented for each directly + * attached task, once for the dependent filter, and if + * requested for the user notifier. When @refs reaches zero, + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -125,10 +126,10 @@ struct notification { * how namespaces work. * * seccomp_filter objects should never be modified after being attached - * to a task_struct (other than @usage). + * to a task_struct (other than @refs). */ struct seccomp_filter { - refcount_t usage; + refcount_t refs; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -464,7 +465,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) return ERR_PTR(ret); } - refcount_set(&sfilter->usage, 1); + refcount_set(&sfilter->refs, 1); return sfilter; } @@ -558,7 +559,7 @@ static long seccomp_attach_filter(unsigned int flags, static void __get_seccomp_filter(struct seccomp_filter *filter) { - refcount_inc(&filter->usage); + refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ @@ -581,7 +582,7 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->usage)) { + while (orig && refcount_dec_and_test(&orig->refs)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); -- cgit v1.2.3 From 3a15fb6ed92cb32b0a83f406aa4a96f28c9adbc3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:29 +0200 Subject: seccomp: release filter after task is fully dead The seccomp filter used to be released in free_task() which is called asynchronously via call_rcu() and assorted mechanisms. Since we need to inform tasks waiting on the seccomp notifier when a filter goes empty we will notify them as soon as a task has been marked fully dead in release_task(). To not split seccomp cleanup into two parts, move filter release out of free_task() and into release_task() after we've unhashed struct task from struct pid, exited signals, and unlinked it from the threadgroups' thread list. We'll put the empty filter notification infrastructure into it in a follow up patch. This also renames put_seccomp_filter() to seccomp_filter_release() which is a more descriptive name of what we're doing here especially once we've added the empty filter notification mechanism in there. We're also NULL-ing the task's filter tree entrypoint which seems cleaner than leaving a dangling pointer in there. Note that this shouldn't need any memory barriers since we're calling this when the task is in release_task() which means it's EXIT_DEAD. So it can't modify its seccomp filters anymore. You can also see this from the point where we're calling seccomp_filter_release(). It's after __exit_signal() and at this point, tsk->sighand will already have been NULLed which is required for thread-sync and filter installation alike. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-2-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 ++-- kernel/exit.c | 1 + kernel/fork.c | 1 - kernel/seccomp.c | 62 +++++++++++++++++++++++++++++-------------------- 4 files changed, 40 insertions(+), 28 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 2ec2720f83cc..babcd6c02d09 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -84,10 +84,10 @@ static inline int seccomp_mode(struct seccomp *s) #endif /* CONFIG_SECCOMP */ #ifdef CONFIG_SECCOMP_FILTER -extern void put_seccomp_filter(struct task_struct *tsk); +extern void seccomp_filter_release(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); #else /* CONFIG_SECCOMP_FILTER */ -static inline void put_seccomp_filter(struct task_struct *tsk) +static inline void seccomp_filter_release(struct task_struct *tsk) { return; } diff --git a/kernel/exit.c b/kernel/exit.c index 727150f28103..00d77e5ba700 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -217,6 +217,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); diff --git a/kernel/fork.c b/kernel/fork.c index 142b23645d82..c51a9cd824c5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -473,7 +473,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d4dd3344e312..0ca6d5243427 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -368,6 +368,42 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +/** + * seccomp_filter_release - Detach the task from its filter tree + * and drop its reference count during + * exit. + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. As such, READ_ONCE() and + * barriers are not needed here, as would normally be needed. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *orig = tsk->seccomp.filter; + + /* Detach task from its filter tree. */ + tsk->seccomp.filter = NULL; + __put_seccomp_filter(orig); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -397,7 +433,7 @@ static inline void seccomp_sync_threads(unsigned long flags) * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __put_seccomp_filter(thread->seccomp.filter); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -571,30 +607,6 @@ void get_seccomp_filter(struct task_struct *tsk) __get_seccomp_filter(orig); } -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->refs)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); -} - static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) { clear_siginfo(info); -- cgit v1.2.3 From 76194c4e830d570d9e369d637bb907591d2b3111 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 1 Jun 2020 11:50:07 -0700 Subject: seccomp: Lift wait_queue into struct seccomp_filter Lift the wait_queue from struct notification into struct seccomp_filter. This is cleaner overall and lets us avoid having to take the notifier mutex in the future for EPOLLHUP notifications since we need to neither read nor modify the notifier specific aspects of the seccomp filter. In the exit path I'd very much like to avoid having to take the notifier mutex for each filter in the task's filter hierarchy. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/seccomp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ca6d5243427..39c1c687d8ad 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -95,13 +95,11 @@ struct seccomp_knotif { * filter->notify_lock. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. - * @wqh: A wait queue for poll. */ struct notification { struct semaphore request; u64 next_id; struct list_head notifications; - wait_queue_head_t wqh; }; /** @@ -117,6 +115,7 @@ struct notification { * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. + * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting @@ -135,6 +134,7 @@ struct seccomp_filter { struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; + wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ @@ -502,6 +502,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + init_waitqueue_head(&sfilter->wqh); return sfilter; } @@ -774,7 +775,7 @@ static int seccomp_do_user_notification(int this_syscall, list_add(&n.list, &match->notif->notifications); up(&match->notif->request); - wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM); + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); mutex_unlock(&match->notify_lock); /* @@ -1098,7 +1099,7 @@ static long seccomp_notify_recv(struct seccomp_filter *filter, unotif.data = *(knotif->data); knotif->state = SECCOMP_NOTIFY_SENT; - wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM); + wake_up_poll(&filter->wqh, EPOLLOUT | EPOLLWRNORM); ret = 0; out: mutex_unlock(&filter->notify_lock); @@ -1217,7 +1218,7 @@ static __poll_t seccomp_notify_poll(struct file *file, __poll_t ret = 0; struct seccomp_knotif *cur; - poll_wait(file, &filter->notif->wqh, poll_tab); + poll_wait(file, &filter->wqh, poll_tab); if (mutex_lock_interruptible(&filter->notify_lock) < 0) return EPOLLERR; @@ -1261,7 +1262,6 @@ static struct file *init_listener(struct seccomp_filter *filter) sema_init(&filter->notif->request, 0); filter->notif->next_id = get_random_u64(); INIT_LIST_HEAD(&filter->notif->notifications); - init_waitqueue_head(&filter->notif->wqh); ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops, filter, O_RDWR); -- cgit v1.2.3 From 99cdb8b9a57393b5978e7a6310a2cba511dd179b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:30 +0200 Subject: seccomp: notify about unused filter We've been making heavy use of the seccomp notifier to intercept and handle certain syscalls for containers. This patch allows a syscall supervisor listening on a given notifier to be notified when a seccomp filter has become unused. A container is often managed by a singleton supervisor process the so-called "monitor". This monitor process has an event loop which has various event handlers registered. If the user specified a seccomp profile that included a notifier for various syscalls then we also register a seccomp notify even handler. For any container using a separate pid namespace the lifecycle of the seccomp notifier is bound to the init process of the pid namespace, i.e. when the init process exits the filter must be unused. If a new process attaches to a container we force it to assume a seccomp profile. This can either be the same seccomp profile as the container was started with or a modified one. If the attaching process makes use of the seccomp notifier we will register a new seccomp notifier handler in the monitor's event loop. However, when the attaching process exits we can't simply delete the handler since other child processes could've been created (daemons spawned etc.) that have inherited the seccomp filter and so we need to keep the seccomp notifier fd alive in the event loop. But this is problematic since we don't get a notification when the seccomp filter has become unused and so we currently never remove the seccomp notifier fd from the event loop and just keep accumulating fds in the event loop. We've had this issue for a while but it has recently become more pressing as more and larger users make use of this. To fix this, we introduce a new "users" reference counter that tracks any tasks and dependent filters making use of a filter. When a notifier is registered waiting tasks will be notified that the filter is now empty by receiving a (E)POLLHUP event. The concept in this patch introduces is the same as for signal_struct, i.e. reference counting for life-cycle management is decoupled from reference counting taks using the object. There's probably some trickery possible but the second counter is just the correct way of doing this IMHO and has precedence. Cc: Tycho Andersen Cc: Kees Cook Cc: Matt Denton Cc: Sargun Dhillon Cc: Jann Horn Cc: Chris Palmer Cc: Aleksa Sarai Cc: Robert Sesek Cc: Jeffrey Vander Stoep Cc: Linux Containers Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-3-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- kernel/seccomp.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 39c1c687d8ad..ba30bffa9301 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -110,6 +110,14 @@ struct notification { * attached task, once for the dependent filter, and if * requested for the user notifier. When @refs reaches zero, * the filter can be freed. + * @users: A filter's @users count is incremented for each directly + * attached task (filter installation, fork(), thread_sync), + * and once for the dependent filter (tracked in filter->prev). + * When it reaches zero it indicates that no direct or indirect + * users of that filter exist. No new tasks can get associated with + * this filter after reaching 0. The @users count is always smaller + * or equal to @refs. Hence, reaching 0 for @users does not mean + * the filter can be freed. * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -129,6 +137,7 @@ struct notification { */ struct seccomp_filter { refcount_t refs; + refcount_t users; bool log; struct seccomp_filter *prev; struct bpf_prog *prog; @@ -376,6 +385,15 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } +static void __seccomp_filter_orphan(struct seccomp_filter *orig) +{ + while (orig && refcount_dec_and_test(&orig->users)) { + if (waitqueue_active(&orig->wqh)) + wake_up_poll(&orig->wqh, EPOLLHUP); + orig = orig->prev; + } +} + static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ @@ -386,10 +404,18 @@ static void __put_seccomp_filter(struct seccomp_filter *orig) } } +static void __seccomp_filter_release(struct seccomp_filter *orig) +{ + /* Notify about any unused filters in the task's former filter tree. */ + __seccomp_filter_orphan(orig); + /* Finally drop all references to the task's former tree. */ + __put_seccomp_filter(orig); +} + /** - * seccomp_filter_release - Detach the task from its filter tree - * and drop its reference count during - * exit. + * seccomp_filter_release - Detach the task from its filter tree, + * drop its reference count, and notify + * about unused filters * * This function should only be called when the task is exiting as * it detaches it from its filter tree. As such, READ_ONCE() and @@ -401,7 +427,7 @@ void seccomp_filter_release(struct task_struct *tsk) /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; - __put_seccomp_filter(orig); + __seccomp_filter_release(orig); } /** @@ -428,12 +454,15 @@ static inline void seccomp_sync_threads(unsigned long flags) /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); + /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ - __put_seccomp_filter(thread->seccomp.filter); + __seccomp_filter_release(thread->seccomp.filter); + + /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, @@ -502,6 +531,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) } refcount_set(&sfilter->refs, 1); + refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); return sfilter; @@ -606,6 +636,7 @@ void get_seccomp_filter(struct task_struct *tsk) if (!orig) return; __get_seccomp_filter(orig); + refcount_inc(&orig->users); } static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) @@ -1234,6 +1265,9 @@ static __poll_t seccomp_notify_poll(struct file *file, mutex_unlock(&filter->notify_lock); + if (refcount_read(&filter->users) == 0) + ret |= EPOLLHUP; + return ret; } -- cgit v1.2.3 From ad5682184a811d62e56f07d25d75eeee9dffe3d9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sun, 31 May 2020 13:50:31 +0200 Subject: selftests/seccomp: Check for EPOLLHUP for user_notif This verifies we're correctly notified when a seccomp filter becomes unused when a notifier is in use. Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20200531115031.391515-4-christian.brauner@ubuntu.com Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 136 ++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index c8f93edc52cc..fc899cfbca99 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -51,6 +51,7 @@ #include #include "../kselftest_harness.h" +#include "../clone3/clone3_selftests.h" /* Attempt to de-conflict with the selftests tree. */ #ifndef SKIP @@ -3702,6 +3703,141 @@ skip: } } +TEST(user_notification_filter_empty) +{ + pid_t pid; + long ret; + int status; + struct pollfd pollfd; + struct clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + int listener; + + listener = user_trap_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + _exit(EXIT_FAILURE); + + if (dup2(listener, 200) != 200) + _exit(EXIT_FAILURE); + + close(listener); + + _exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* + * The seccomp filter has become unused so we should be notified once + * the kernel gets around to cleaning up task struct. + */ + pollfd.fd = 200; + pollfd.events = POLLHUP; + + EXPECT_GT(poll(&pollfd, 1, 2000), 0); + EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); +} + +static void *do_thread(void *data) +{ + return NULL; +} + +TEST(user_notification_filter_empty_threaded) +{ + pid_t pid; + long ret; + int status; + struct pollfd pollfd; + struct clone_args args = { + .flags = CLONE_FILES, + .exit_signal = SIGCHLD, + }; + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + pid = sys_clone3(&args, sizeof(args)); + ASSERT_GE(pid, 0); + + if (pid == 0) { + pid_t pid1, pid2; + int listener, status; + pthread_t thread; + + listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); + if (listener < 0) + _exit(EXIT_FAILURE); + + if (dup2(listener, 200) != 200) + _exit(EXIT_FAILURE); + + close(listener); + + pid1 = fork(); + if (pid1 < 0) + _exit(EXIT_FAILURE); + + if (pid1 == 0) + _exit(EXIT_SUCCESS); + + pid2 = fork(); + if (pid2 < 0) + _exit(EXIT_FAILURE); + + if (pid2 == 0) + _exit(EXIT_SUCCESS); + + if (pthread_create(&thread, NULL, do_thread, NULL) || + pthread_join(thread, NULL)) + _exit(EXIT_FAILURE); + + if (pthread_create(&thread, NULL, do_thread, NULL) || + pthread_join(thread, NULL)) + _exit(EXIT_FAILURE); + + if (waitpid(pid1, &status, 0) != pid1 || !WIFEXITED(status) || + WEXITSTATUS(status)) + _exit(EXIT_FAILURE); + + if (waitpid(pid2, &status, 0) != pid2 || !WIFEXITED(status) || + WEXITSTATUS(status)) + _exit(EXIT_FAILURE); + + exit(EXIT_SUCCESS); + } + + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + /* + * The seccomp filter has become unused so we should be notified once + * the kernel gets around to cleaning up task struct. + */ + pollfd.fd = 200; + pollfd.events = POLLHUP; + + EXPECT_GT(poll(&pollfd, 1, 2000), 0); + EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); +} + /* * TODO: * - add microbenchmarks -- cgit v1.2.3 From d3a37ea9f6e548388b83fe895c7a037bc2ec3f7f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jun 2020 12:34:44 -0700 Subject: selftests/seccomp: Expand benchmark to per-filter measurements It's useful to see how much (at a minimum) each filter adds to the syscall overhead. Add additional calculations. Signed-off-by: Kees Cook --- .../testing/selftests/seccomp/seccomp_benchmark.c | 36 +++++++++++++++++----- tools/testing/selftests/seccomp/seccomp_bpf.c | 2 -- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index 5838c8697ec3..eca13fe1fba9 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -68,32 +68,54 @@ int main(int argc, char *argv[]) }; long ret; unsigned long long samples; - unsigned long long native, filtered; + unsigned long long native, filter1, filter2; if (argc > 1) samples = strtoull(argv[1], NULL, 0); else samples = calibrate(); + printf("Current BPF sysctl settings:\n"); + system("sysctl net.core.bpf_jit_enable"); + system("sysctl net.core.bpf_jit_harden"); printf("Benchmarking %llu samples...\n", samples); + /* Native call */ native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; printf("getpid native: %llu ns\n", native); ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); assert(ret == 0); + /* One filter */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); assert(ret == 0); - filtered = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; - printf("getpid RET_ALLOW: %llu ns\n", filtered); + filter1 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 1 filter: %llu ns\n", filter1); - printf("Estimated seccomp overhead per syscall: %llu ns\n", - filtered - native); + if (filter1 == native) + printf("No overhead measured!? Try running again with more samples.\n"); - if (filtered == native) - printf("Trying running again with more samples.\n"); + /* Two filters */ + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); + assert(ret == 0); + + filter2 = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; + printf("getpid RET_ALLOW 2 filters: %llu ns\n", filter2); + + /* Calculations */ + printf("Estimated total seccomp overhead for 1 filter: %llu ns\n", + filter1 - native); + + printf("Estimated total seccomp overhead for 2 filters: %llu ns\n", + filter2 - native); + + printf("Estimated seccomp per-filter overhead: %llu ns\n", + filter2 - filter1); + + printf("Estimated seccomp entry overhead: %llu ns\n", + filter1 - native - (filter2 - filter1)); return 0; } diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index fc899cfbca99..d76ae98a34a2 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3840,7 +3840,6 @@ TEST(user_notification_filter_empty_threaded) /* * TODO: - * - add microbenchmarks * - expand NNP testing * - better arch-specific TRACE and TRAP handlers. * - endianness checking when appropriate @@ -3848,7 +3847,6 @@ TEST(user_notification_filter_empty_threaded) * - arch value testing (x86 modes especially) * - verify that FILTER_FLAG_LOG filters generate log messages * - verify that RET_LOG generates log messages - * - ... */ TEST_HARNESS_MAIN -- cgit v1.2.3 From bc32c9c86581abf7baacf71342df3b0affe367db Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 1 Jun 2020 12:50:12 -0700 Subject: selftests/seccomp: use 90s as timeout As seccomp_benchmark tries to calibrate how many samples will take more than 5 seconds to execute, it may end up picking up a number of samples that take 10 (but up to 12) seconds. As the calibration will take double that time, it takes around 20 seconds. Then, it executes the whole thing again, and then once more, with some added overhead. So, the thing might take more than 40 seconds, which is too close to the 45s timeout. That is very dependent on the system where it's executed, so may not be observed always, but it has been observed on x86 VMs. Using a 90s timeout seems safe enough. Signed-off-by: Thadeu Lima de Souza Cascardo Link: https://lore.kernel.org/r/20200601123202.1183526-1-cascardo@canonical.com Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/seccomp/settings diff --git a/tools/testing/selftests/seccomp/settings b/tools/testing/selftests/seccomp/settings new file mode 100644 index 000000000000..ba4d85f74cd6 --- /dev/null +++ b/tools/testing/selftests/seccomp/settings @@ -0,0 +1 @@ +timeout=90 -- cgit v1.2.3 From 81a0c8bc82be7c15dbf3e54832334552e6b76e2b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 6 Jun 2020 09:37:17 -0700 Subject: selftests/seccomp: Improve calibration loop The seccomp benchmark calibration loop did not need to take so long. Instead, use a simple 1 second timeout and multiply up to target. It does not need to be accurate. Signed-off-by: Kees Cook --- .../testing/selftests/seccomp/seccomp_benchmark.c | 50 ++++++++++++++-------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_benchmark.c b/tools/testing/selftests/seccomp/seccomp_benchmark.c index eca13fe1fba9..91f5a89cadac 100644 --- a/tools/testing/selftests/seccomp/seccomp_benchmark.c +++ b/tools/testing/selftests/seccomp/seccomp_benchmark.c @@ -18,9 +18,9 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples) { - pid_t pid, ret; - unsigned long long i; struct timespec start, finish; + unsigned long long i; + pid_t pid, ret; pid = getpid(); assert(clock_gettime(clk_id, &start) == 0); @@ -31,30 +31,43 @@ unsigned long long timing(clockid_t clk_id, unsigned long long samples) assert(clock_gettime(clk_id, &finish) == 0); i = finish.tv_sec - start.tv_sec; - i *= 1000000000; + i *= 1000000000ULL; i += finish.tv_nsec - start.tv_nsec; - printf("%lu.%09lu - %lu.%09lu = %llu\n", + printf("%lu.%09lu - %lu.%09lu = %llu (%.1fs)\n", finish.tv_sec, finish.tv_nsec, start.tv_sec, start.tv_nsec, - i); + i, (double)i / 1000000000.0); return i; } unsigned long long calibrate(void) { - unsigned long long i; - - printf("Calibrating reasonable sample size...\n"); + struct timespec start, finish; + unsigned long long i, samples, step = 9973; + pid_t pid, ret; + int seconds = 15; - for (i = 5; ; i++) { - unsigned long long samples = 1 << i; + printf("Calibrating sample size for %d seconds worth of syscalls ...\n", seconds); - /* Find something that takes more than 5 seconds to run. */ - if (timing(CLOCK_REALTIME, samples) / 1000000000ULL > 5) - return samples; - } + samples = 0; + pid = getpid(); + assert(clock_gettime(CLOCK_MONOTONIC, &start) == 0); + do { + for (i = 0; i < step; i++) { + ret = syscall(__NR_getpid); + assert(pid == ret); + } + assert(clock_gettime(CLOCK_MONOTONIC, &finish) == 0); + + samples += step; + i = finish.tv_sec - start.tv_sec; + i *= 1000000000ULL; + i += finish.tv_nsec - start.tv_nsec; + } while (i < 1000000000ULL); + + return samples * seconds; } int main(int argc, char *argv[]) @@ -70,15 +83,16 @@ int main(int argc, char *argv[]) unsigned long long samples; unsigned long long native, filter1, filter2; + printf("Current BPF sysctl settings:\n"); + system("sysctl net.core.bpf_jit_enable"); + system("sysctl net.core.bpf_jit_harden"); + if (argc > 1) samples = strtoull(argv[1], NULL, 0); else samples = calibrate(); - printf("Current BPF sysctl settings:\n"); - system("sysctl net.core.bpf_jit_enable"); - system("sysctl net.core.bpf_jit_harden"); - printf("Benchmarking %llu samples...\n", samples); + printf("Benchmarking %llu syscalls...\n", samples); /* Native call */ native = timing(CLOCK_PROCESS_CPUTIME_ID, samples) / samples; -- cgit v1.2.3 From e68f9d49dda1744d548426c8b4335a8d693a36d0 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 22:02:56 -0700 Subject: seccomp: Use pr_fmt Avoid open-coding "seccomp: " prefixes for pr_*() calls. Signed-off-by: Kees Cook --- kernel/seccomp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ba30bffa9301..5f0e3f3a7a5d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -13,6 +13,7 @@ * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ +#define pr_fmt(fmt) "seccomp: " fmt #include #include @@ -1873,7 +1874,7 @@ static int __init seccomp_sysctl_init(void) hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table); if (!hdr) - pr_warn("seccomp: sysctl registration failed\n"); + pr_warn("sysctl registration failed\n"); else kmemleak_not_leak(hdr); -- cgit v1.2.3 From cf8918dba2de7315a5878a56818a89cb899132da Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jun 2020 11:02:21 -0700 Subject: selftests/seccomp: Make kcmp() less required The seccomp tests are a bit noisy without CONFIG_CHECKPOINT_RESTORE (due to missing the kcmp() syscall). The seccomp tests are more accurate with kcmp(), but it's not strictly required. Refactor the tests to use alternatives (comparing fd numbers), and provide a central test for kcmp() so there is a single SKIP instead of many. Continue to produce warnings for the other tests, though. Additionally adds some more bad flag EINVAL tests to the addfd selftest. Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: Andrii Nakryiko Cc: John Fastabend Cc: KP Singh Cc: linux-kselftest@vger.kernel.org Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 58 ++++++++++++++++++--------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index d76ae98a34a2..c589a2d495a3 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -242,6 +242,40 @@ int seccomp(unsigned int op, unsigned int flags, void *args) #define SIBLING_EXIT_FAILURE 0xbadface #define SIBLING_EXIT_NEWPRIVS 0xbadfeed +static int __filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2) +{ +#ifdef __NR_kcmp + errno = 0; + return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2); +#else + errno = ENOSYS; + return -1; +#endif +} + +/* Have TH_LOG report actual location filecmp() is used. */ +#define filecmp(pid1, pid2, fd1, fd2) ({ \ + int _ret; \ + \ + _ret = __filecmp(pid1, pid2, fd1, fd2); \ + if (_ret != 0) { \ + if (_ret < 0 && errno == ENOSYS) { \ + TH_LOG("kcmp() syscall missing (test is less accurate)");\ + _ret = 0; \ + } \ + } \ + _ret; }) + +TEST(kcmp) +{ + int ret; + + ret = __filecmp(getpid(), getpid(), 1, 1); + EXPECT_EQ(ret, 0); + if (ret != 0 && errno == ENOSYS) + SKIP(return, "Kernel does not support kcmp() (missing CONFIG_CHECKPOINT_RESTORE?)"); +} + TEST(mode_strict_support) { long ret; @@ -3601,16 +3635,6 @@ TEST(seccomp_get_notif_sizes) EXPECT_EQ(sizes.seccomp_notif_resp, sizeof(struct seccomp_notif_resp)); } -static int filecmp(pid_t pid1, pid_t pid2, int fd1, int fd2) -{ -#ifdef __NR_kcmp - return syscall(__NR_kcmp, pid1, pid2, KCMP_FILE, fd1, fd2); -#else - errno = ENOSYS; - return -1; -#endif -} - TEST(user_notification_continue) { pid_t pid; @@ -3635,20 +3659,14 @@ TEST(user_notification_continue) int dup_fd, pipe_fds[2]; pid_t self; - ret = pipe(pipe_fds); - if (ret < 0) - exit(1); + ASSERT_GE(pipe(pipe_fds), 0); dup_fd = dup(pipe_fds[0]); - if (dup_fd < 0) - exit(1); + ASSERT_GE(dup_fd, 0); + EXPECT_NE(pipe_fds[0], dup_fd); self = getpid(); - - ret = filecmp(self, self, pipe_fds[0], dup_fd); - if (ret) - exit(2); - + ASSERT_EQ(filecmp(self, self, pipe_fds[0], dup_fd), 0); exit(0); } -- cgit v1.2.3 From 279ed8900079831ba436007dbc5e24530f026ce2 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 11 Jun 2020 11:08:48 -0700 Subject: selftests/seccomp: Rename user_trap_syscall() to user_notif_syscall() The user_trap_syscall() helper creates a filter with SECCOMP_RET_USER_NOTIF. To avoid confusion with SECCOMP_RET_TRAP, rename the helper to user_notif_syscall(). Cc: Andy Lutomirski Cc: Will Drewry Cc: Shuah Khan Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: Andrii Nakryiko Cc: John Fastabend Cc: KP Singh Cc: linux-kselftest@vger.kernel.org Cc: netdev@vger.kernel.org Cc: bpf@vger.kernel.org Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 46 +++++++++++++-------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index c589a2d495a3..43884b6625fc 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -3167,7 +3167,7 @@ skip: ASSERT_EQ(0, kill(pid, SIGKILL)); } -static int user_trap_syscall(int nr, unsigned int flags) +static int user_notif_syscall(int nr, unsigned int flags) { struct sock_filter filter[] = { BPF_STMT(BPF_LD+BPF_W+BPF_ABS, @@ -3213,7 +3213,7 @@ TEST(user_notification_basic) /* Check that we get -ENOSYS with no listener attached */ if (pid == 0) { - if (user_trap_syscall(__NR_getppid, 0) < 0) + if (user_notif_syscall(__NR_getppid, 0) < 0) exit(1); ret = syscall(__NR_getppid); exit(ret >= 0 || errno != ENOSYS); @@ -3230,13 +3230,13 @@ TEST(user_notification_basic) EXPECT_EQ(seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog), 0); /* Check that the basic notification machinery works */ - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); /* Installing a second listener in the chain should EBUSY */ - EXPECT_EQ(user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER), + EXPECT_EQ(user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER), -1); EXPECT_EQ(errno, EBUSY); @@ -3305,12 +3305,12 @@ TEST(user_notification_with_tsync) /* these were exclusive */ flags = SECCOMP_FILTER_FLAG_NEW_LISTENER | SECCOMP_FILTER_FLAG_TSYNC; - ASSERT_EQ(-1, user_trap_syscall(__NR_getppid, flags)); + ASSERT_EQ(-1, user_notif_syscall(__NR_getppid, flags)); ASSERT_EQ(EINVAL, errno); /* but now they're not */ flags |= SECCOMP_FILTER_FLAG_TSYNC_ESRCH; - ret = user_trap_syscall(__NR_getppid, flags); + ret = user_notif_syscall(__NR_getppid, flags); close(ret); ASSERT_LE(0, ret); } @@ -3328,8 +3328,8 @@ TEST(user_notification_kill_in_middle) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); /* @@ -3382,8 +3382,8 @@ TEST(user_notification_signal) ASSERT_EQ(socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair), 0); - listener = user_trap_syscall(__NR_gettid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_gettid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); pid = fork(); @@ -3452,8 +3452,8 @@ TEST(user_notification_closed_listener) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); /* @@ -3489,8 +3489,8 @@ TEST(user_notification_child_pid_ns) SKIP(return, "kernel missing CLONE_NEWUSER support"); }; - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); pid = fork(); @@ -3529,8 +3529,8 @@ TEST(user_notification_sibling_pid_ns) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); pid = fork(); @@ -3597,8 +3597,8 @@ TEST(user_notification_fault_recv) ASSERT_EQ(unshare(CLONE_NEWUSER), 0); - listener = user_trap_syscall(__NR_getppid, - SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); pid = fork(); @@ -3649,7 +3649,7 @@ TEST(user_notification_continue) TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); } - listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); ASSERT_GE(listener, 0); pid = fork(); @@ -3743,7 +3743,7 @@ TEST(user_notification_filter_empty) if (pid == 0) { int listener; - listener = user_trap_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_mknod, SECCOMP_FILTER_FLAG_NEW_LISTENER); if (listener < 0) _exit(EXIT_FAILURE); @@ -3799,7 +3799,7 @@ TEST(user_notification_filter_empty_threaded) int listener, status; pthread_t thread; - listener = user_trap_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); + listener = user_notif_syscall(__NR_dup, SECCOMP_FILTER_FLAG_NEW_LISTENER); if (listener < 0) _exit(EXIT_FAILURE); -- cgit v1.2.3 From 47e33c05f9f07cac3de833e531bcac9ae052c7ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 15 Jun 2020 15:42:46 -0700 Subject: seccomp: Fix ioctl number for SECCOMP_IOCTL_NOTIF_ID_VALID When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced it had the wrong direction flag set. While this isn't a big deal as nothing currently enforces these bits in the kernel, it should be defined correctly. Fix the define and provide support for the old command until it is no longer needed for backward compatibility. Fixes: 6a21cc50f0c7 ("seccomp: add a return code to trap to userspace") Signed-off-by: Kees Cook --- include/uapi/linux/seccomp.h | 3 ++- kernel/seccomp.c | 9 +++++++++ tools/testing/selftests/seccomp/seccomp_bpf.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index c1735455bc53..965290f7dcc2 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -123,5 +123,6 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) + #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0e3f3a7a5d..0ed57e8c49d0 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -44,6 +44,14 @@ #include #include +/* + * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the + * wrong direction flag in the ioctl number. This is the broken one, + * which the kernel needs to keep supporting until all userspaces stop + * using the wrong command number. + */ +#define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) + enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, @@ -1236,6 +1244,7 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, return seccomp_notify_recv(filter, buf); case SECCOMP_IOCTL_NOTIF_SEND: return seccomp_notify_send(filter, buf); + case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); default: diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 43884b6625fc..61f9ac200001 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -186,7 +186,7 @@ struct seccomp_metadata { #define SECCOMP_IOCTL_NOTIF_RECV SECCOMP_IOWR(0, struct seccomp_notif) #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) -#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64) +#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) struct seccomp_notif { __u64 id; -- cgit v1.2.3 From fe4bfff86ec54773df3db79e8112e3b0f820c799 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 19 Jun 2020 12:20:15 -0700 Subject: seccomp: Use -1 marker for end of mode 1 syscall list The terminator for the mode 1 syscalls list was a 0, but that could be a valid syscall number (e.g. x86_64 __NR_read). By luck, __NR_read was listed first and the loop construct would not test it, so there was no bug. However, this is fragile. Replace the terminator with -1 instead, and make the variable name for mode 1 syscall lists more descriptive. Cc: Andy Lutomirski Cc: Will Drewry Signed-off-by: Kees Cook --- arch/mips/include/asm/seccomp.h | 4 ++-- include/asm-generic/seccomp.h | 2 +- kernel/seccomp.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h index e383d7e27b93..aa809589a181 100644 --- a/arch/mips/include/asm/seccomp.h +++ b/arch/mips/include/asm/seccomp.h @@ -9,12 +9,12 @@ static inline const int *get_compat_mode1_syscalls(void) static const int syscalls_O32[] = { __NR_O32_Linux + 3, __NR_O32_Linux + 4, __NR_O32_Linux + 1, __NR_O32_Linux + 193, - 0, /* null terminated */ + -1, /* negative terminated */ }; static const int syscalls_N32[] = { __NR_N32_Linux + 0, __NR_N32_Linux + 1, __NR_N32_Linux + 58, __NR_N32_Linux + 211, - 0, /* null terminated */ + -1, /* negative terminated */ }; if (IS_ENABLED(CONFIG_MIPS32_O32) && test_thread_flag(TIF_32BIT_REGS)) diff --git a/include/asm-generic/seccomp.h b/include/asm-generic/seccomp.h index 1321ac7821d7..6b6f42bc58f9 100644 --- a/include/asm-generic/seccomp.h +++ b/include/asm-generic/seccomp.h @@ -33,7 +33,7 @@ static inline const int *get_compat_mode1_syscalls(void) static const int mode1_syscalls_32[] = { __NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32, - 0, /* null terminated */ + -1, /* negative terminated */ }; return mode1_syscalls_32; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ed57e8c49d0..866a432cd746 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -742,20 +742,20 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, - 0, /* null terminated */ + -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { - const int *syscall_whitelist = mode1_syscalls; + const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) - syscall_whitelist = get_compat_mode1_syscalls(); + allowed_syscalls = get_compat_mode1_syscalls(); #endif do { - if (*syscall_whitelist == this_syscall) + if (*allowed_syscalls == this_syscall) return; - } while (*++syscall_whitelist); + } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); -- cgit v1.2.3 From 9d1587adcc356c1570c9830364e7797a9c19696c Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Jul 2020 22:05:43 -0700 Subject: selftests/harness: Clean up kern-doc for fixtures The FIXTURE*() macro kern-doc examples had the wrong names for the C code examples associated with them. Fix those and clarify that FIXTURE_DATA() usage should be avoided. Cc: Shuah Khan Fixes: 74bc7c97fa88 ("kselftest: add fixture variants") Acked-by: Jakub Kicinski Signed-off-by: Kees Cook --- tools/testing/selftests/kselftest_harness.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/kselftest_harness.h b/tools/testing/selftests/kselftest_harness.h index c9f03ef93338..7f32a7099a81 100644 --- a/tools/testing/selftests/kselftest_harness.h +++ b/tools/testing/selftests/kselftest_harness.h @@ -195,8 +195,9 @@ * * .. code-block:: c * - * FIXTURE_DATA(datatype name) + * FIXTURE_DATA(datatype_name) * + * Almost always, you want just FIXTURE() instead (see below). * This call may be used when the type of the fixture data * is needed. In general, this should not be needed unless * the *self* is being passed to a helper directly. @@ -211,7 +212,7 @@ * * .. code-block:: c * - * FIXTURE(datatype name) { + * FIXTURE(fixture_name) { * type property1; * ... * }; @@ -238,7 +239,7 @@ * * .. code-block:: c * - * FIXTURE_SETUP(fixture name) { implementation } + * FIXTURE_SETUP(fixture_name) { implementation } * * Populates the required "setup" function for a fixture. An instance of the * datatype defined with FIXTURE_DATA() will be exposed as *self* for the @@ -264,7 +265,7 @@ * * .. code-block:: c * - * FIXTURE_TEARDOWN(fixture name) { implementation } + * FIXTURE_TEARDOWN(fixture_name) { implementation } * * Populates the required "teardown" function for a fixture. An instance of the * datatype defined with FIXTURE_DATA() will be exposed as *self* for the @@ -285,7 +286,7 @@ * * .. code-block:: c * - * FIXTURE_VARIANT(datatype name) { + * FIXTURE_VARIANT(fixture_name) { * type property1; * ... * }; @@ -305,8 +306,8 @@ * * .. code-block:: c * - * FIXTURE_ADD(datatype name) { - * .property1 = val1; + * FIXTURE_VARIANT_ADD(fixture_name, variant_name) { + * .property1 = val1, * ... * }; * -- cgit v1.2.3 From adeeec8472397001901c06aa41d1b2fd1068e222 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Jul 2020 22:34:04 -0700 Subject: selftests/seccomp: Refactor to use fixture variants Now that the selftest harness has variants, use them to eliminate a bunch of copy/paste duplication. Reviewed-by: Jakub Kicinski Tested-by: Will Deacon Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 199 ++++++-------------------- 1 file changed, 42 insertions(+), 157 deletions(-) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 61f9ac200001..462a30e81ca9 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1510,6 +1510,7 @@ pid_t setup_trace_fixture(struct __test_metadata *_metadata, return tracer_pid; } + void teardown_trace_fixture(struct __test_metadata *_metadata, pid_t tracer) { @@ -1789,7 +1790,7 @@ void change_syscall(struct __test_metadata *_metadata, EXPECT_EQ(0, ret); } -void tracer_syscall(struct __test_metadata *_metadata, pid_t tracee, +void tracer_seccomp(struct __test_metadata *_metadata, pid_t tracee, int status, void *args) { int ret; @@ -1866,6 +1867,24 @@ FIXTURE(TRACE_syscall) { pid_t tracer, mytid, mypid, parent; }; +FIXTURE_VARIANT(TRACE_syscall) { + /* + * All of the SECCOMP_RET_TRACE behaviors can be tested with either + * SECCOMP_RET_TRACE+PTRACE_CONT or plain ptrace()+PTRACE_SYSCALL. + * This indicates if we should use SECCOMP_RET_TRACE (false), or + * ptrace (true). + */ + bool use_ptrace; +}; + +FIXTURE_VARIANT_ADD(TRACE_syscall, ptrace) { + .use_ptrace = true, +}; + +FIXTURE_VARIANT_ADD(TRACE_syscall, seccomp) { + .use_ptrace = false, +}; + FIXTURE_SETUP(TRACE_syscall) { struct sock_filter filter[] = { @@ -1881,12 +1900,11 @@ FIXTURE_SETUP(TRACE_syscall) BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_TRACE | 0x1005), BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), }; - - memset(&self->prog, 0, sizeof(self->prog)); - self->prog.filter = malloc(sizeof(filter)); - ASSERT_NE(NULL, self->prog.filter); - memcpy(self->prog.filter, filter, sizeof(filter)); - self->prog.len = (unsigned short)ARRAY_SIZE(filter); + struct sock_fprog prog = { + .len = (unsigned short)ARRAY_SIZE(filter), + .filter = filter, + }; + long ret; /* Prepare some testable syscall results. */ self->mytid = syscall(__NR_gettid); @@ -1904,60 +1922,28 @@ FIXTURE_SETUP(TRACE_syscall) ASSERT_NE(self->parent, self->mypid); /* Launch tracer. */ - self->tracer = setup_trace_fixture(_metadata, tracer_syscall, NULL, - false); -} - -FIXTURE_TEARDOWN(TRACE_syscall) -{ - teardown_trace_fixture(_metadata, self->tracer); - if (self->prog.filter) - free(self->prog.filter); -} + self->tracer = setup_trace_fixture(_metadata, + variant->use_ptrace ? tracer_ptrace + : tracer_seccomp, + NULL, variant->use_ptrace); -TEST_F(TRACE_syscall, ptrace_syscall_redirected) -{ - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ - teardown_trace_fixture(_metadata, self->tracer); - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, - true); - - /* Tracer will redirect getpid to getppid. */ - EXPECT_NE(self->mypid, syscall(__NR_getpid)); -} + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret); -TEST_F(TRACE_syscall, ptrace_syscall_errno) -{ - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ - teardown_trace_fixture(_metadata, self->tracer); - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, - true); + if (variant->use_ptrace) + return; - /* Tracer should skip the open syscall, resulting in ESRCH. */ - EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat)); + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); + ASSERT_EQ(0, ret); } -TEST_F(TRACE_syscall, ptrace_syscall_faked) +FIXTURE_TEARDOWN(TRACE_syscall) { - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ teardown_trace_fixture(_metadata, self->tracer); - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, - true); - - /* Tracer should skip the gettid syscall, resulting fake pid. */ - EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid)); } TEST_F(TRACE_syscall, syscall_allowed) { - long ret; - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - /* getppid works as expected (no changes). */ EXPECT_EQ(self->parent, syscall(__NR_getppid)); EXPECT_NE(self->mypid, syscall(__NR_getppid)); @@ -1965,14 +1951,6 @@ TEST_F(TRACE_syscall, syscall_allowed) TEST_F(TRACE_syscall, syscall_redirected) { - long ret; - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - /* getpid has been redirected to getppid as expected. */ EXPECT_EQ(self->parent, syscall(__NR_getpid)); EXPECT_NE(self->mypid, syscall(__NR_getpid)); @@ -1980,33 +1958,17 @@ TEST_F(TRACE_syscall, syscall_redirected) TEST_F(TRACE_syscall, syscall_errno) { - long ret; - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - - /* openat has been skipped and an errno return. */ + /* Tracer should skip the open syscall, resulting in ESRCH. */ EXPECT_SYSCALL_RETURN(-ESRCH, syscall(__NR_openat)); } TEST_F(TRACE_syscall, syscall_faked) { - long ret; - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - - /* gettid has been skipped and an altered return value stored. */ + /* Tracer skips the gettid syscall and store altered return value. */ EXPECT_SYSCALL_RETURN(45000, syscall(__NR_gettid)); } -TEST_F(TRACE_syscall, skip_after_RET_TRACE) +TEST_F(TRACE_syscall, skip_after) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -2021,14 +1983,7 @@ TEST_F(TRACE_syscall, skip_after_RET_TRACE) }; long ret; - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - /* Install fixture filter. */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - - /* Install "errno on getppid" filter. */ + /* Install additional "errno on getppid" filter. */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); ASSERT_EQ(0, ret); @@ -2038,7 +1993,7 @@ TEST_F(TRACE_syscall, skip_after_RET_TRACE) EXPECT_EQ(EPERM, errno); } -TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS) +TEST_F_SIGNAL(TRACE_syscall, kill_after, SIGSYS) { struct sock_filter filter[] = { BPF_STMT(BPF_LD|BPF_W|BPF_ABS, @@ -2053,77 +2008,7 @@ TEST_F_SIGNAL(TRACE_syscall, kill_after_RET_TRACE, SIGSYS) }; long ret; - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - /* Install fixture filter. */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &self->prog, 0, 0); - ASSERT_EQ(0, ret); - - /* Install "death on getppid" filter. */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); - ASSERT_EQ(0, ret); - - /* Tracer will redirect getpid to getppid, and we should die. */ - EXPECT_NE(self->mypid, syscall(__NR_getpid)); -} - -TEST_F(TRACE_syscall, skip_after_ptrace) -{ - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ERRNO | EPERM), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; - long ret; - - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ - teardown_trace_fixture(_metadata, self->tracer); - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, - true); - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - /* Install "errno on getppid" filter. */ - ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); - ASSERT_EQ(0, ret); - - /* Tracer will redirect getpid to getppid, and we should see EPERM. */ - EXPECT_EQ(-1, syscall(__NR_getpid)); - EXPECT_EQ(EPERM, errno); -} - -TEST_F_SIGNAL(TRACE_syscall, kill_after_ptrace, SIGSYS) -{ - struct sock_filter filter[] = { - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, - offsetof(struct seccomp_data, nr)), - BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_getppid, 0, 1), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), - BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), - }; - struct sock_fprog prog = { - .len = (unsigned short)ARRAY_SIZE(filter), - .filter = filter, - }; - long ret; - - /* Swap SECCOMP_RET_TRACE tracer for PTRACE_SYSCALL tracer. */ - teardown_trace_fixture(_metadata, self->tracer); - self->tracer = setup_trace_fixture(_metadata, tracer_ptrace, NULL, - true); - - ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); - ASSERT_EQ(0, ret); - - /* Install "death on getppid" filter. */ + /* Install additional "death on getppid" filter. */ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0); ASSERT_EQ(0, ret); -- cgit v1.2.3 From 11eb004ef7ead0c43586adea0340b51da095bdee Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 4 Jul 2020 22:57:33 -0700 Subject: selftests/seccomp: Check ENOSYS under tracing There should be no difference between -1 and other negative syscalls while tracing. Cc: Keno Fischer Tested-by: Will Deacon Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 462a30e81ca9..13b88d578db6 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -1942,6 +1942,26 @@ FIXTURE_TEARDOWN(TRACE_syscall) teardown_trace_fixture(_metadata, self->tracer); } +TEST(negative_ENOSYS) +{ + /* + * There should be no difference between an "internal" skip + * and userspace asking for syscall "-1". + */ + errno = 0; + EXPECT_EQ(-1, syscall(-1)); + EXPECT_EQ(errno, ENOSYS); + /* And no difference for "still not valid but not -1". */ + errno = 0; + EXPECT_EQ(-1, syscall(-101)); + EXPECT_EQ(errno, ENOSYS); +} + +TEST_F(TRACE_syscall, negative_ENOSYS) +{ + negative_ENOSYS(_metadata); +} + TEST_F(TRACE_syscall, syscall_allowed) { /* getppid works as expected (no changes). */ -- cgit v1.2.3 From d9539752d23283db4692384a634034f451261e29 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:11:29 -0700 Subject: net/compat: Add missing sock updates for SCM_RIGHTS Add missed sock updates to compat path via a new helper, which will be used more in coming patches. (The net/core/scm.c code is left as-is here to assist with -stable backports for the compat path.) Cc: Christoph Hellwig Cc: Sargun Dhillon Cc: Jakub Kicinski Cc: stable@vger.kernel.org Fixes: 48a87cc26c13 ("net: netprio: fd passed in SCM_RIGHTS datagram not set correctly") Fixes: d84295067fc7 ("net: net_cls: fd passed in SCM_RIGHTS datagram not set correctly") Acked-by: Christian Brauner Signed-off-by: Kees Cook --- include/net/sock.h | 4 ++++ net/compat.c | 1 + net/core/sock.c | 21 +++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index c53cc42b5ab9..2be67f1ee8b1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -890,6 +890,8 @@ static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } + +void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) @@ -897,6 +899,8 @@ static inline int sk_memalloc_socks(void) return 0; } +static inline void __receive_sock(struct file *file) +{ } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) diff --git a/net/compat.c b/net/compat.c index 5e3041a2c37d..2937b816107d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -309,6 +309,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) break; } /* Bump the usage count and install the file. */ + __receive_sock(fp[i]); fd_install(new_fd, get_file(fp[i])); } diff --git a/net/core/sock.c b/net/core/sock.c index 6c4acf1f0220..bde394979041 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2840,6 +2840,27 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct * } EXPORT_SYMBOL(sock_no_mmap); +/* + * When a file is received (via SCM_RIGHTS, etc), we must bump the + * various sock-based usage counts. + */ +void __receive_sock(struct file *file) +{ + struct socket *sock; + int error; + + /* + * The resulting value of "error" is ignored here since we only + * need to take action when the file is a socket and testing + * "sock" for NULL is sufficient. + */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } +} + ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { ssize_t res; -- cgit v1.2.3 From 4969f8a073977123504609d7310b42a588297aa4 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Add missing sock updates for pidfd_getfd() The sock counting (sock_update_netprioidx() and sock_update_classid()) was missing from pidfd's implementation of received fd installation. Add a call to the new __receive_sock() helper. Cc: Christian Brauner Cc: Christoph Hellwig Cc: Sargun Dhillon Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: stable@vger.kernel.org Fixes: 8649c322f75c ("pid: Implement pidfd_getfd syscall") Signed-off-by: Kees Cook --- kernel/pid.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index f1496b757162..ee58530d1aca 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -642,10 +643,12 @@ static int pidfd_getfd(struct pid *pid, int fd) } ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) + if (ret < 0) { fput(file); - else + } else { + __receive_sock(file); fd_install(ret, file); + } return ret; } -- cgit v1.2.3 From c0029de50982c1fb215330a5f9d433cec0cfd8cc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:11:29 -0700 Subject: net/scm: Regularize compat handling of scm_detach_fds() Duplicate the cleanups from commit 2618d530dd8b ("net/scm: cleanup scm_detach_fds") into the compat code. Replace open-coded __receive_sock() with a call to the helper. Move the check added in commit 1f466e1f15cf ("net: cleanly handle kernel vs user buffers for ->msg_control") to before the compat call, even though it should be impossible for an in-kernel call to also be compat. Correct the int "flags" argument to unsigned int to match fd_install() and similar APIs. Regularize any remaining differences, including a whitespace issue, a checkpatch warning, and add the check from commit 6900317f5eff ("net, scm: fix PaX detected msg_controllen overflow in scm_detach_fds") which fixed an overflow unique to 64-bit. To avoid confusion when comparing the compat handler to the native handler, just include the same check in the compat handler. Cc: Christoph Hellwig Cc: Sargun Dhillon Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Acked-by: Christian Brauner Signed-off-by: Kees Cook --- include/net/scm.h | 1 + net/compat.c | 56 +++++++++++++++++++++++++------------------------------ net/core/scm.c | 27 +++++++++++---------------- 3 files changed, 37 insertions(+), 47 deletions(-) diff --git a/include/net/scm.h b/include/net/scm.h index 1ce365f4c256..581a94d6c613 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -37,6 +37,7 @@ struct scm_cookie { #endif }; +int __scm_install_fd(struct file *file, int __user *ufd, unsigned int o_flags); void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); diff --git a/net/compat.c b/net/compat.c index 2937b816107d..27d477fdcaa0 100644 --- a/net/compat.c +++ b/net/compat.c @@ -281,40 +281,31 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat return 0; } -void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) +static int scm_max_fds_compat(struct msghdr *msg) { - struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; - int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; - int err = 0, i; + if (msg->msg_controllen <= sizeof(struct compat_cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); +} - if (fdnum < fdmax) - fdmax = fdnum; +void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) +{ + struct compat_cmsghdr __user *cm = + (struct compat_cmsghdr __user *)msg->msg_control; + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds_compat(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); + int err = 0, i; - for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) { - int new_fd; - err = security_file_receive(fp[i]); + for (i = 0; i < fdmax; i++) { + err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - __receive_sock(fp[i]); - fd_install(new_fd, get_file(fp[i])); } if (i > 0) { int cmlen = CMSG_COMPAT_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); @@ -322,16 +313,19 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) err = put_user(cmlen, &cm->cmsg_len); if (!err) { cmlen = CMSG_COMPAT_SPACE(i * sizeof(int)); - kmsg->msg_control += cmlen; - kmsg->msg_controllen -= cmlen; + if (msg->msg_controllen < cmlen) + cmlen = msg->msg_controllen; + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; } } - if (i < fdnum) - kmsg->msg_flags |= MSG_CTRUNC; + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } diff --git a/net/core/scm.c b/net/core/scm.c index 875df1c2989d..44f03213dcab 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -280,9 +280,8 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); -static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +int __scm_install_fd(struct file *file, int __user *ufd, unsigned int o_flags) { - struct socket *sock; int new_fd; int error; @@ -300,12 +299,8 @@ static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) return error; } - /* Bump the usage count and install the file. */ - sock = sock_from_file(file, &error); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } + /* Bump the sock usage counts, if any. */ + __receive_sock(file); fd_install(new_fd, get_file(file)); return 0; } @@ -319,29 +314,29 @@ static int scm_max_fds(struct msghdr *msg) void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user*)msg->msg_control; - int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + struct cmsghdr __user *cm = + (__force struct cmsghdr __user *)msg->msg_control; + unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - /* no use for FD passing from kernel space callers */ - if (WARN_ON_ONCE(!msg->msg_control_is_user)) - return; - for (i = 0; i < fdmax; i++) { err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; } - if (i > 0) { + if (i > 0) { int cmlen = CMSG_LEN(i * sizeof(int)); err = put_user(SOL_SOCKET, &cm->cmsg_level); -- cgit v1.2.3 From 6659061045cc93f609e100b128f30581e5f012e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Jun 2020 08:20:05 -0700 Subject: fs: Move __scm_install_fd() to __receive_fd() In preparation for users of the "install a received file" logic outside of net/ (pidfd and seccomp), relocate and rename __scm_install_fd() from net/core/scm.c to __receive_fd() in fs/file.c, and provide a wrapper named receive_fd_user(), as future patches will change the interface to __receive_fd(). Additionally add a comment to fd_install() as a counterpoint to how __receive_fd() interacts with fput(). Cc: Alexander Viro Cc: "David S. Miller" Cc: Jakub Kicinski Cc: Dmitry Kadashev Cc: Jens Axboe Cc: Arnd Bergmann Cc: Sargun Dhillon Cc: Ido Schimmel Cc: Ioana Ciornei Cc: linux-fsdevel@vger.kernel.org Cc: netdev@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- fs/file.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/file.h | 8 ++++++++ include/net/scm.h | 1 - net/compat.c | 2 +- net/core/scm.c | 27 +-------------------------- 5 files changed, 55 insertions(+), 28 deletions(-) diff --git a/fs/file.c b/fs/file.c index abb8b7081d7a..0cd598cab476 100644 --- a/fs/file.c +++ b/fs/file.c @@ -18,6 +18,7 @@ #include #include #include +#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -613,6 +614,10 @@ void __fd_install(struct files_struct *files, unsigned int fd, rcu_read_unlock_sched(); } +/* + * This consumes the "file" refcount, so callers should treat it + * as if they had called fput(file). + */ void fd_install(unsigned int fd, struct file *file) { __fd_install(current->files, fd, file); @@ -931,6 +936,46 @@ out_unlock: return err; } +/** + * __receive_fd() - Install received file into file descriptor table + * + * @file: struct file that was received from another process + * @ufd: __user pointer to write new fd number to + * @o_flags: the O_* flags to apply to the new fd entry + * + * Installs a received file into the file descriptor table, with appropriate + * checks and count updates. Writes the fd number to userspace. + * + * This helper handles its own reference counting of the incoming + * struct file. + * + * Returns -ve on error. + */ +int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +{ + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the sock usage counts, if any. */ + __receive_sock(file); + fd_install(new_fd, get_file(file)); + return 0; +} + static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/include/linux/file.h b/include/linux/file.h index 122f80084a3e..b14ff2ffd0bd 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -91,6 +91,14 @@ extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); +extern int __receive_fd(struct file *file, int __user *ufd, + unsigned int o_flags); +static inline int receive_fd_user(struct file *file, int __user *ufd, + unsigned int o_flags) +{ + return __receive_fd(file, ufd, o_flags); +} + extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); diff --git a/include/net/scm.h b/include/net/scm.h index 581a94d6c613..1ce365f4c256 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -37,7 +37,6 @@ struct scm_cookie { #endif }; -int __scm_install_fd(struct file *file, int __user *ufd, unsigned int o_flags); void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); diff --git a/net/compat.c b/net/compat.c index 27d477fdcaa0..e74cd3dae8b0 100644 --- a/net/compat.c +++ b/net/compat.c @@ -298,7 +298,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) int err = 0, i; for (i = 0; i < fdmax; i++) { - err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); + err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; } diff --git a/net/core/scm.c b/net/core/scm.c index 44f03213dcab..67c166a7820d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -280,31 +280,6 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); -int __scm_install_fd(struct file *file, int __user *ufd, unsigned int o_flags) -{ - int new_fd; - int error; - - error = security_file_receive(file); - if (error) - return error; - - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; - - error = put_user(new_fd, ufd); - if (error) { - put_unused_fd(new_fd); - return error; - } - - /* Bump the sock usage counts, if any. */ - __receive_sock(file); - fd_install(new_fd, get_file(file)); - return 0; -} - static int scm_max_fds(struct msghdr *msg) { if (msg->msg_controllen <= sizeof(struct cmsghdr)) @@ -331,7 +306,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) } for (i = 0; i < fdmax; i++) { - err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); + err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; } -- cgit v1.2.3 From deefa7f3505ae2fb6a7cb75f50134b65a1dd1494 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Jun 2020 20:47:45 -0700 Subject: fs: Add receive_fd() wrapper for __receive_fd() For both pidfd and seccomp, the __user pointer is not used. Update __receive_fd() to make writing to ufd optional via a NULL check. However, for the receive_fd_user() wrapper, ufd is NULL checked so an -EFAULT can be returned to avoid changing the SCM_RIGHTS interface behavior. Add new wrapper receive_fd() for pidfd and seccomp that does not use the ufd argument. For the new helper, the allocated fd needs to be returned on success. Update the existing callers to handle it. Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- fs/file.c | 17 ++++++++++------- include/linux/file.h | 7 +++++++ net/compat.c | 2 +- net/core/scm.c | 2 +- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/file.c b/fs/file.c index 0cd598cab476..56d96d5c0c9f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -944,12 +944,13 @@ out_unlock: * @o_flags: the O_* flags to apply to the new fd entry * * Installs a received file into the file descriptor table, with appropriate - * checks and count updates. Writes the fd number to userspace. + * checks and count updates. Optionally writes the fd number to userspace, if + * @ufd is non-NULL. * * This helper handles its own reference counting of the incoming * struct file. * - * Returns -ve on error. + * Returns newly install fd or -ve on error. */ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { @@ -964,16 +965,18 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) if (new_fd < 0) return new_fd; - error = put_user(new_fd, ufd); - if (error) { - put_unused_fd(new_fd); - return error; + if (ufd) { + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } } /* Bump the sock usage counts, if any. */ __receive_sock(file); fd_install(new_fd, get_file(file)); - return 0; + return new_fd; } static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) diff --git a/include/linux/file.h b/include/linux/file.h index b14ff2ffd0bd..d9fee9f5c8da 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -9,6 +9,7 @@ #include #include #include +#include struct file; @@ -96,8 +97,14 @@ extern int __receive_fd(struct file *file, int __user *ufd, static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { + if (ufd == NULL) + return -EFAULT; return __receive_fd(file, ufd, o_flags); } +static inline int receive_fd(struct file *file, unsigned int o_flags) +{ + return __receive_fd(file, NULL, o_flags); +} extern void flush_delayed_fput(void); extern void __fput_sync(struct file *); diff --git a/net/compat.c b/net/compat.c index e74cd3dae8b0..dc7ddbc2b15e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -299,7 +299,7 @@ void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm) for (i = 0; i < fdmax; i++) { err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); - if (err) + if (err < 0) break; } diff --git a/net/core/scm.c b/net/core/scm.c index 67c166a7820d..8156d4fb8a39 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -307,7 +307,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) for (i = 0; i < fdmax; i++) { err = receive_fd_user(scm->fp->fp[i], cmsg_data + i, o_flags); - if (err) + if (err < 0) break; } -- cgit v1.2.3 From 910d2f16ac90463a1f5b03d53246c443e2b354b9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 9 Jun 2020 16:21:38 -0700 Subject: pidfd: Replace open-coded receive_fd() Replace the open-coded version of receive_fd() with a call to the new helper. Thanks to Vamshi K Sthambamkadi for catching a missed fput() in an earlier version of this patch. Cc: Christoph Hellwig Cc: Jakub Kicinski Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- kernel/pid.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index ee58530d1aca..da5aea5f04fa 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -636,19 +636,8 @@ static int pidfd_getfd(struct pid *pid, int fd) if (IS_ERR(file)) return PTR_ERR(file); - ret = security_file_receive(file); - if (ret) { - fput(file); - return ret; - } - - ret = get_unused_fd_flags(O_CLOEXEC); - if (ret < 0) { - fput(file); - } else { - __receive_sock(file); - fd_install(ret, file); - } + ret = receive_fd(file, O_CLOEXEC); + fput(file); return ret; } -- cgit v1.2.3 From 173817151b15d5a72a9bef1d2df7e6e7f6750f2e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Jun 2020 08:46:58 -0700 Subject: fs: Expand __receive_fd() to accept existing fd Expand __receive_fd() with support for replace_fd() for the coming seccomp "addfd" ioctl(). Add new wrapper receive_fd_replace() for the new behavior and update existing wrappers to retain old behavior. Thanks to Colin Ian King for pointing out an uninitialized variable exposure in an earlier version of this patch. Cc: Alexander Viro Cc: Dmitry Kadashev Cc: Jens Axboe Cc: Arnd Bergmann Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Sargun Dhillon Acked-by: Christian Brauner Signed-off-by: Kees Cook --- fs/file.c | 25 +++++++++++++++++++------ include/linux/file.h | 10 +++++++--- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/file.c b/fs/file.c index 56d96d5c0c9f..4fb111735d1d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -939,6 +939,7 @@ out_unlock: /** * __receive_fd() - Install received file into file descriptor table * + * @fd: fd to install into (if negative, a new fd will be allocated) * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -952,7 +953,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) +int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -961,21 +962,33 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) if (error) return error; - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; + if (fd < 0) { + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + } else { + new_fd = fd; + } if (ufd) { error = put_user(new_fd, ufd); if (error) { - put_unused_fd(new_fd); + if (fd < 0) + put_unused_fd(new_fd); return error; } } + if (fd < 0) { + fd_install(new_fd, get_file(file)); + } else { + error = replace_fd(new_fd, file, o_flags); + if (error) + return error; + } + /* Bump the sock usage counts, if any. */ __receive_sock(file); - fd_install(new_fd, get_file(file)); return new_fd; } diff --git a/include/linux/file.h b/include/linux/file.h index d9fee9f5c8da..225982792fa2 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -92,18 +92,22 @@ extern void put_unused_fd(unsigned int fd); extern void fd_install(unsigned int fd, struct file *file); -extern int __receive_fd(struct file *file, int __user *ufd, +extern int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags); static inline int receive_fd_user(struct file *file, int __user *ufd, unsigned int o_flags) { if (ufd == NULL) return -EFAULT; - return __receive_fd(file, ufd, o_flags); + return __receive_fd(-1, file, ufd, o_flags); } static inline int receive_fd(struct file *file, unsigned int o_flags) { - return __receive_fd(file, NULL, o_flags); + return __receive_fd(-1, file, NULL, o_flags); +} +static inline int receive_fd_replace(int fd, struct file *file, unsigned int o_flags) +{ + return __receive_fd(fd, file, NULL, o_flags); } extern void flush_delayed_fput(void); -- cgit v1.2.3 From 7cf97b12545503992020796c74bd84078eb39299 Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:43 -0700 Subject: seccomp: Introduce addfd ioctl to seccomp user notifier The current SECCOMP_RET_USER_NOTIF API allows for syscall supervision over an fd. It is often used in settings where a supervising task emulates syscalls on behalf of a supervised task in userspace, either to further restrict the supervisee's syscall abilities or to circumvent kernel enforced restrictions the supervisor deems safe to lift (e.g. actually performing a mount(2) for an unprivileged container). While SECCOMP_RET_USER_NOTIF allows for the interception of any syscall, only a certain subset of syscalls could be correctly emulated. Over the last few development cycles, the set of syscalls which can't be emulated has been reduced due to the addition of pidfd_getfd(2). With this we are now able to, for example, intercept syscalls that require the supervisor to operate on file descriptors of the supervisee such as connect(2). However, syscalls that cause new file descriptors to be installed can not currently be correctly emulated since there is no way for the supervisor to inject file descriptors into the supervisee. This patch adds a new addfd ioctl to remove this restriction by allowing the supervisor to install file descriptors into the intercepted task. By implementing this feature via seccomp the supervisor effectively instructs the supervisee to install a set of file descriptors into its own file descriptor table during the intercepted syscall. This way it is possible to intercept syscalls such as open() or accept(), and install (or replace, like dup2(2)) the supervisor's resulting fd into the supervisee. One replacement use-case would be to redirect the stdout and stderr of a supervisee into log file descriptors opened by the supervisor. The ioctl handling is based on the discussions[1] of how Extensible Arguments should interact with ioctls. Instead of building size into the addfd structure, make it a function of the ioctl command (which is how sizes are normally passed to ioctls). To support forward and backward compatibility, just mask out the direction and size, and match everything. The size (and any future direction) checks are done along with copy_struct_from_user() logic. As a note, the seccomp_notif_addfd structure is laid out based on 8-byte alignment without requiring packing as there have been packing issues with uapi highlighted before[2][3]. Although we could overload the newfd field and use -1 to indicate that it is not to be used, doing so requires changing the size of the fd field, and introduces struct packing complexity. [1]: https://lore.kernel.org/lkml/87o8w9bcaf.fsf@mid.deneb.enyo.de/ [2]: https://lore.kernel.org/lkml/a328b91d-fd8f-4f27-b3c2-91a9c45f18c0@rasmusvillemoes.dk/ [3]: https://lore.kernel.org/lkml/20200612104629.GA15814@ircssh-2.c.rugged-nimbus-611.internal Cc: Christoph Hellwig Cc: Christian Brauner Cc: Tycho Andersen Cc: Jann Horn Cc: Robert Sesek Cc: Chris Palmer Cc: Al Viro Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-api@vger.kernel.org Suggested-by: Matt Denton Link: https://lore.kernel.org/r/20200603011044.7972-4-sargun@sargun.me Signed-off-by: Sargun Dhillon Reviewed-by: Will Drewry Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- include/linux/seccomp.h | 4 + include/uapi/linux/seccomp.h | 22 ++++++ kernel/seccomp.c | 175 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 199 insertions(+), 2 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index babcd6c02d09..881c90b6aa25 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -10,6 +10,10 @@ SECCOMP_FILTER_FLAG_NEW_LISTENER | \ SECCOMP_FILTER_FLAG_TSYNC_ESRCH) +/* sizeof() the first published struct seccomp_notif_addfd */ +#define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24 +#define SECCOMP_NOTIFY_ADDFD_SIZE_LATEST SECCOMP_NOTIFY_ADDFD_SIZE_VER0 + #ifdef CONFIG_SECCOMP #include diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h index 965290f7dcc2..6ba18b82a02e 100644 --- a/include/uapi/linux/seccomp.h +++ b/include/uapi/linux/seccomp.h @@ -113,6 +113,25 @@ struct seccomp_notif_resp { __u32 flags; }; +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +/** + * struct seccomp_notif_addfd + * @id: The ID of the seccomp notification + * @flags: SECCOMP_ADDFD_FLAG_* + * @srcfd: The local fd number + * @newfd: Optional remote FD number if SETFD option is set, otherwise 0. + * @newfd_flags: The O_* flags the remote FD should have applied + */ +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; + #define SECCOMP_IOC_MAGIC '!' #define SECCOMP_IO(nr) _IO(SECCOMP_IOC_MAGIC, nr) #define SECCOMP_IOR(nr, type) _IOR(SECCOMP_IOC_MAGIC, nr, type) @@ -124,5 +143,8 @@ struct seccomp_notif_resp { #define SECCOMP_IOCTL_NOTIF_SEND SECCOMP_IOWR(1, \ struct seccomp_notif_resp) #define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOW(2, __u64) +/* On success, the return value is the remote process's added fd number */ +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ + struct seccomp_notif_addfd) #endif /* _UAPI_LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 866a432cd746..3ee59ce0a323 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -87,10 +87,42 @@ struct seccomp_knotif { long val; u32 flags; - /* Signals when this has entered SECCOMP_NOTIFY_REPLIED */ + /* + * Signals when this has changed states, such as the listener + * dying, a new seccomp addfd message, or changing to REPLIED + */ struct completion ready; struct list_head list; + + /* outstanding addfd requests */ + struct list_head addfd; +}; + +/** + * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages + * + * @file: A reference to the file to install in the other task + * @fd: The fd number to install it at. If the fd number is -1, it means the + * installing process should allocate the fd as normal. + * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC + * is allowed. + * @ret: The return value of the installing process. It is set to the fd num + * upon success (>= 0). + * @completion: Indicates that the installing process has completed fd + * installation, or gone away (either due to successful + * reply, or signal) + * + */ +struct seccomp_kaddfd { + struct file *file; + int fd; + unsigned int flags; + + /* To only be set on reply */ + int ret; + struct completion completion; + struct list_head list; }; /** @@ -793,6 +825,17 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) return filter->notif->next_id++; } +static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd) +{ + /* + * Remove the notification, and reset the list pointers, indicating + * that it has been handled. + */ + list_del_init(&addfd->list); + addfd->ret = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); + complete(&addfd->completion); +} + static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) @@ -801,6 +844,7 @@ static int seccomp_do_user_notification(int this_syscall, u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; + struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; @@ -813,6 +857,7 @@ static int seccomp_do_user_notification(int this_syscall, n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add(&n.list, &match->notif->notifications); + INIT_LIST_HEAD(&n.addfd); up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); @@ -821,17 +866,34 @@ static int seccomp_do_user_notification(int this_syscall, /* * This is where we wait for a reply from userspace. */ +wait: err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err == 0) { + /* Check if we were woken up by a addfd message */ + addfd = list_first_entry_or_null(&n.addfd, + struct seccomp_kaddfd, list); + if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + seccomp_handle_addfd(addfd); + mutex_unlock(&match->notify_lock); + goto wait; + } ret = n.val; err = n.error; flags = n.flags; } + /* If there were any pending addfd calls, clear them out */ + list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { + /* The process went away before we got a chance to handle it */ + addfd->ret = -ESRCH; + list_del_init(&addfd->list); + complete(&addfd->completion); + } + /* * Note that it's possible the listener died in between the time when - * we were notified of a respons (or a signal) and when we were able to + * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * @@ -1069,6 +1131,11 @@ static int seccomp_notify_release(struct inode *inode, struct file *file) knotif->error = -ENOSYS; knotif->val = 0; + /* + * We do not need to wake up any pending addfd messages, as + * the notifier will do that for us, as this just looks + * like a standard reply. + */ complete(&knotif->ready); } @@ -1233,12 +1300,109 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter, return ret; } +static long seccomp_notify_addfd(struct seccomp_filter *filter, + struct seccomp_notif_addfd __user *uaddfd, + unsigned int size) +{ + struct seccomp_notif_addfd addfd; + struct seccomp_knotif *knotif; + struct seccomp_kaddfd kaddfd; + int ret; + + BUILD_BUG_ON(sizeof(addfd) < SECCOMP_NOTIFY_ADDFD_SIZE_VER0); + BUILD_BUG_ON(sizeof(addfd) != SECCOMP_NOTIFY_ADDFD_SIZE_LATEST); + + if (size < SECCOMP_NOTIFY_ADDFD_SIZE_VER0 || size >= PAGE_SIZE) + return -EINVAL; + + ret = copy_struct_from_user(&addfd, sizeof(addfd), uaddfd, size); + if (ret) + return ret; + + if (addfd.newfd_flags & ~O_CLOEXEC) + return -EINVAL; + + if (addfd.flags & ~SECCOMP_ADDFD_FLAG_SETFD) + return -EINVAL; + + if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) + return -EINVAL; + + kaddfd.file = fget(addfd.srcfd); + if (!kaddfd.file) + return -EBADF; + + kaddfd.flags = addfd.newfd_flags; + kaddfd.fd = (addfd.flags & SECCOMP_ADDFD_FLAG_SETFD) ? + addfd.newfd : -1; + init_completion(&kaddfd.completion); + + ret = mutex_lock_interruptible(&filter->notify_lock); + if (ret < 0) + goto out; + + knotif = find_notification(filter, addfd.id); + if (!knotif) { + ret = -ENOENT; + goto out_unlock; + } + + /* + * We do not want to allow for FD injection to occur before the + * notification has been picked up by a userspace handler, or after + * the notification has been replied to. + */ + if (knotif->state != SECCOMP_NOTIFY_SENT) { + ret = -EINPROGRESS; + goto out_unlock; + } + + list_add(&kaddfd.list, &knotif->addfd); + complete(&knotif->ready); + mutex_unlock(&filter->notify_lock); + + /* Now we wait for it to be processed or be interrupted */ + ret = wait_for_completion_interruptible(&kaddfd.completion); + if (ret == 0) { + /* + * We had a successful completion. The other side has already + * removed us from the addfd queue, and + * wait_for_completion_interruptible has a memory barrier upon + * success that lets us read this value directly without + * locking. + */ + ret = kaddfd.ret; + goto out; + } + + mutex_lock(&filter->notify_lock); + /* + * Even though we were woken up by a signal and not a successful + * completion, a completion may have happened in the mean time. + * + * We need to check again if the addfd request has been handled, + * and if not, we will remove it from the queue. + */ + if (list_empty(&kaddfd.list)) + ret = kaddfd.ret; + else + list_del(&kaddfd.list); + +out_unlock: + mutex_unlock(&filter->notify_lock); +out: + fput(kaddfd.file); + + return ret; +} + static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seccomp_filter *filter = file->private_data; void __user *buf = (void __user *)arg; + /* Fixed-size ioctls */ switch (cmd) { case SECCOMP_IOCTL_NOTIF_RECV: return seccomp_notify_recv(filter, buf); @@ -1247,6 +1411,13 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd, case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR: case SECCOMP_IOCTL_NOTIF_ID_VALID: return seccomp_notify_id_valid(filter, buf); + } + + /* Extensible Argument ioctls */ +#define EA_IOCTL(cmd) ((cmd) & ~(IOC_INOUT | IOCSIZE_MASK)) + switch (EA_IOCTL(cmd)) { + case EA_IOCTL(SECCOMP_IOCTL_NOTIF_ADDFD): + return seccomp_notify_addfd(filter, buf, _IOC_SIZE(cmd)); default: return -EINVAL; } -- cgit v1.2.3 From c97aedc52dce4c87d4c44de4e6af941cd102600c Mon Sep 17 00:00:00 2001 From: Sargun Dhillon Date: Tue, 2 Jun 2020 18:10:44 -0700 Subject: selftests/seccomp: Test SECCOMP_IOCTL_NOTIF_ADDFD Test whether we can add file descriptors in response to notifications. This injects the file descriptors via notifications, and then uses kcmp to determine whether or not it has been successful. It also includes some basic sanity checking for arguments. Signed-off-by: Sargun Dhillon Cc: Al Viro Cc: Chris Palmer Cc: Christian Brauner Cc: Jann Horn Cc: Kees Cook Cc: Robert Sesek Cc: Tycho Andersen Cc: Matt Denton Cc: linux-fsdevel@vger.kernel.org Cc: linux-kernel@vger.kernel.org Link: https://lore.kernel.org/r/20200603011044.7972-5-sargun@sargun.me Co-developed-by: Kees Cook Signed-off-by: Kees Cook --- tools/testing/selftests/seccomp/seccomp_bpf.c | 229 ++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c index 13b88d578db6..41ee7a73ee07 100644 --- a/tools/testing/selftests/seccomp/seccomp_bpf.c +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -173,7 +174,9 @@ struct seccomp_metadata { #ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER #define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) +#endif +#ifndef SECCOMP_RET_USER_NOTIF #define SECCOMP_RET_USER_NOTIF 0x7fc00000U #define SECCOMP_IOC_MAGIC '!' @@ -209,6 +212,39 @@ struct seccomp_notif_sizes { }; #endif +#ifndef SECCOMP_IOCTL_NOTIF_ADDFD +/* On success, the return value is the remote process's added fd number */ +#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \ + struct seccomp_notif_addfd) + +/* valid flags for seccomp_notif_addfd */ +#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ + +struct seccomp_notif_addfd { + __u64 id; + __u32 flags; + __u32 srcfd; + __u32 newfd; + __u32 newfd_flags; +}; +#endif + +struct seccomp_notif_addfd_small { + __u64 id; + char weird[4]; +}; +#define SECCOMP_IOCTL_NOTIF_ADDFD_SMALL \ + SECCOMP_IOW(3, struct seccomp_notif_addfd_small) + +struct seccomp_notif_addfd_big { + union { + struct seccomp_notif_addfd addfd; + char buf[sizeof(struct seccomp_notif_addfd) + 8]; + }; +}; +#define SECCOMP_IOCTL_NOTIF_ADDFD_BIG \ + SECCOMP_IOWR(3, struct seccomp_notif_addfd_big) + #ifndef PTRACE_EVENTMSG_SYSCALL_ENTRY #define PTRACE_EVENTMSG_SYSCALL_ENTRY 1 #define PTRACE_EVENTMSG_SYSCALL_EXIT 2 @@ -3761,6 +3797,199 @@ TEST(user_notification_filter_empty_threaded) EXPECT_GT((pollfd.revents & POLLHUP) ?: 0, 0); } +TEST(user_notification_addfd) +{ + pid_t pid; + long ret; + int status, listener, memfd, fd; + struct seccomp_notif_addfd addfd = {}; + struct seccomp_notif_addfd_small small = {}; + struct seccomp_notif_addfd_big big = {}; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + /* 100 ms */ + struct timespec delay = { .tv_nsec = 100000000 }; + + memfd = memfd_create("test", 0); + ASSERT_GE(memfd, 0); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Check that the basic notification machinery works */ + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) { + if (syscall(__NR_getppid) != USER_NOTIF_MAGIC) + exit(1); + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); + } + + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + addfd.srcfd = memfd; + addfd.newfd = 0; + addfd.id = req.id; + addfd.flags = 0x0; + + /* Verify bad newfd_flags cannot be set */ + addfd.newfd_flags = ~O_CLOEXEC; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EINVAL); + addfd.newfd_flags = O_CLOEXEC; + + /* Verify bad flags cannot be set */ + addfd.flags = 0xff; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EINVAL); + addfd.flags = 0; + + /* Verify that remote_fd cannot be set without setting flags */ + addfd.newfd = 1; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EINVAL); + addfd.newfd = 0; + + /* Verify small size cannot be set */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_SMALL, &small), -1); + EXPECT_EQ(errno, EINVAL); + + /* Verify we can't send bits filled in unknown buffer area */ + memset(&big, 0xAA, sizeof(big)); + big.addfd = addfd; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big), -1); + EXPECT_EQ(errno, E2BIG); + + + /* Verify we can set an arbitrary remote fd */ + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + /* + * The child has fds 0(stdin), 1(stdout), 2(stderr), 3(memfd), + * 4(listener), so the newly allocated fd should be 5. + */ + EXPECT_EQ(fd, 5); + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); + + /* Verify we can set an arbitrary remote fd with large size */ + memset(&big, 0x0, sizeof(big)); + big.addfd = addfd; + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD_BIG, &big); + EXPECT_EQ(fd, 6); + + /* Verify we can set a specific remote fd */ + addfd.newfd = 42; + addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); + EXPECT_EQ(fd, 42); + EXPECT_EQ(filecmp(getpid(), pid, memfd, fd), 0); + + /* Resume syscall */ + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + /* + * This sets the ID of the ADD FD to the last request plus 1. The + * notification ID increments 1 per notification. + */ + addfd.id = req.id + 1; + + /* This spins until the underlying notification is generated */ + while (ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd) != -1 && + errno != -EINPROGRESS) + nanosleep(&delay, NULL); + + memset(&req, 0, sizeof(req)); + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + ASSERT_EQ(addfd.id, req.id); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + /* Wait for child to finish. */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + close(memfd); +} + +TEST(user_notification_addfd_rlimit) +{ + pid_t pid; + long ret; + int status, listener, memfd; + struct seccomp_notif_addfd addfd = {}; + struct seccomp_notif req = {}; + struct seccomp_notif_resp resp = {}; + const struct rlimit lim = { + .rlim_cur = 0, + .rlim_max = 0, + }; + + memfd = memfd_create("test", 0); + ASSERT_GE(memfd, 0); + + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + ASSERT_EQ(0, ret) { + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); + } + + /* Check that the basic notification machinery works */ + listener = user_notif_syscall(__NR_getppid, + SECCOMP_FILTER_FLAG_NEW_LISTENER); + ASSERT_GE(listener, 0); + + pid = fork(); + ASSERT_GE(pid, 0); + + if (pid == 0) + exit(syscall(__NR_getppid) != USER_NOTIF_MAGIC); + + + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); + + ASSERT_EQ(prlimit(pid, RLIMIT_NOFILE, &lim, NULL), 0); + + addfd.srcfd = memfd; + addfd.newfd_flags = O_CLOEXEC; + addfd.newfd = 0; + addfd.id = req.id; + addfd.flags = 0; + + /* Should probably spot check /proc/sys/fs/file-nr */ + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EMFILE); + + addfd.newfd = 100; + addfd.flags = SECCOMP_ADDFD_FLAG_SETFD; + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd), -1); + EXPECT_EQ(errno, EBADF); + + resp.id = req.id; + resp.error = 0; + resp.val = USER_NOTIF_MAGIC; + + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); + + /* Wait for child to finish. */ + EXPECT_EQ(waitpid(pid, &status, 0), pid); + EXPECT_EQ(true, WIFEXITED(status)); + EXPECT_EQ(0, WEXITSTATUS(status)); + + close(memfd); +} + /* * TODO: * - expand NNP testing -- cgit v1.2.3