From 64bef697d33b75fc06c5789b3f8108680271529f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jan 2024 14:26:02 +0100 Subject: pidfd: implement PIDFD_THREAD flag for pidfd_open() With this flag: - pidfd_open() doesn't require that the target task must be a thread-group leader - pidfd_poll() succeeds when the task exits and becomes a zombie (iow, passes exit_notify()), even if it is a leader and thread-group is not empty. This means that the behaviour of pidfd_poll(PIDFD_THREAD, pid-of-group-leader) is not well defined if it races with exec() from its sub-thread; pidfd_poll() can succeed or not depending on whether pidfd_task_exited() is called before or after exchange_tids(). Perhaps we can improve this behaviour later, pidfd_poll() can probably take sig->group_exec_task into account. But this doesn't really differ from the case when the leader exits before other threads (so pidfd_poll() succeeds) and then another thread execs and pidfd_poll() will block again. thread_group_exited() is no longer used, perhaps it can die. Co-developed-by: Tycho Andersen Signed-off-by: Oleg Nesterov Link: https://lore.kernel.org/r/20240131132602.GA23641@redhat.com Tested-by: Tycho Andersen Reviewed-by: Tycho Andersen Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 5406fbc13074..2e6461459877 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -7,6 +7,7 @@ #include /* Flags for pidfd_open(). 
*/ -#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_NONBLOCK O_NONBLOCK +#define PIDFD_THREAD O_EXCL #endif /* _UAPI_LINUX_PIDFD_H */ -- cgit v1.2.3 From e1fb1dc08e73466830612bcf2f9f72180965c9ba Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 9 Feb 2024 15:49:45 +0100 Subject: pidfd: allow to override signal scope in pidfd_send_signal() Right now we determine the scope of the signal based on the type of pidfd. There are use-cases where it's useful to override the scope of the signal. For example in [1]. Add flags to determine the scope of the signal: (1) PIDFD_SIGNAL_THREAD: send signal to specific thread referenced by @pidfd (2) PIDFD_SIGNAL_THREAD_GROUP: send signal to thread-group of @pidfd (3) PIDFD_SIGNAL_PROCESS_GROUP: send signal to process-group of @pidfd Since we now allow specifying PIDFD_SIGNAL_PROCESS_GROUP for pidfd_send_signal() to send signals to process groups we need to adjust the check restricting si_code emulation by userspace to account for PIDTYPE_PGID. Reviewed-by: Oleg Nesterov Link: https://github.com/systemd/systemd/issues/31093 [1] Link: https://lore.kernel.org/r/20240210-chihuahua-hinzog-3945b6abd44a@brauner Link: https://lore.kernel.org/r/20240214123655.GB16265@redhat.com Signed-off-by: Christian Brauner --- include/uapi/linux/pidfd.h | 5 +++++ kernel/signal.c | 46 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pidfd.h b/include/uapi/linux/pidfd.h index 2e6461459877..72ec000a97cd 100644 --- a/include/uapi/linux/pidfd.h +++ b/include/uapi/linux/pidfd.h @@ -10,4 +10,9 @@ #define PIDFD_NONBLOCK O_NONBLOCK #define PIDFD_THREAD O_EXCL +/* Flags for pidfd_send_signal(). 
*/ +#define PIDFD_SIGNAL_THREAD (1UL << 0) +#define PIDFD_SIGNAL_THREAD_GROUP (1UL << 1) +#define PIDFD_SIGNAL_PROCESS_GROUP (1UL << 2) + #endif /* _UAPI_LINUX_PIDFD_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 8b8169623850..bdca529f0f7b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1905,16 +1905,19 @@ int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, return send_sig_info(info.si_signo, &info, t); } -int kill_pgrp(struct pid *pid, int sig, int priv) +static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp) { int ret; - read_lock(&tasklist_lock); - ret = __kill_pgrp_info(sig, __si_special(priv), pid); + ret = __kill_pgrp_info(sig, info, pgrp); read_unlock(&tasklist_lock); - return ret; } + +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} EXPORT_SYMBOL(kill_pgrp); int kill_pid(struct pid *pid, int sig, int priv) @@ -3873,6 +3876,10 @@ static struct pid *pidfd_to_pid(const struct file *file) return tgid_pidfd_to_pid(file); } +#define PIDFD_SEND_SIGNAL_FLAGS \ + (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \ + PIDFD_SIGNAL_PROCESS_GROUP) + /** * sys_pidfd_send_signal - Signal a process through a pidfd * @pidfd: file descriptor of the process @@ -3897,7 +3904,11 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, enum pid_type type; /* Enforce flags be set to 0 until we add an extension. */ - if (flags) + if (flags & ~PIDFD_SEND_SIGNAL_FLAGS) + return -EINVAL; + + /* Ensure that only a single signal scope determining flag is set. */ + if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1) return -EINVAL; f = fdget(pidfd); @@ -3915,10 +3926,24 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, if (!access_pidfd_pidns(pid)) goto err; - if (f.file->f_flags & PIDFD_THREAD) + switch (flags) { + case 0: + /* Infer scope from the type of pidfd. 
*/ + if (f.file->f_flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_THREAD: type = PIDTYPE_PID; - else + break; + case PIDFD_SIGNAL_THREAD_GROUP: type = PIDTYPE_TGID; + break; + case PIDFD_SIGNAL_PROCESS_GROUP: + type = PIDTYPE_PGID; + break; + } if (info) { ret = copy_siginfo_from_user_any(&kinfo, info); @@ -3931,14 +3956,17 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig, /* Only allow sending arbitrary signals to yourself. */ ret = -EPERM; - if ((task_pid(current) != pid) && + if ((task_pid(current) != pid || type > PIDTYPE_TGID) && (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) goto err; } else { prepare_kill_siginfo(sig, &kinfo, type); } - ret = kill_pid_info_type(sig, &kinfo, pid, type); + if (type == PIDTYPE_PGID) + ret = kill_pgrp_info(sig, &kinfo, pid); + else + ret = kill_pid_info_type(sig, &kinfo, pid, type); err: fdput(f); return ret; -- cgit v1.2.3 From cb12fd8e0dabb9a1c8aef55a6a41e2c255fcdf4b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 12 Feb 2024 16:32:38 +0100 Subject: pidfd: add pidfs This moves pidfds from the anonymous inode infrastructure to a tiny pseudo filesystem. This has been on my todo for quite a while as it will unblock further work that we weren't able to do simply because of the very justified limitations of anonymous inodes. Moving pidfds to a tiny pseudo filesystem allows: * statx() on pidfds becomes useful for the first time. * pidfds can be compared simply via statx() and then comparing inode numbers. * pidfds have unique inode numbers for the system lifetime. * struct pid is now stashed in inode->i_private instead of file->private_data. This means it is now possible to introduce concepts that operate on a process once all file descriptors have been closed. A concrete example is kill-on-last-close. * file->private_data is freed up for per-file options for pidfds. 
* Each struct pid will refer to a different inode but the same struct pid will refer to the same inode if it's opened multiple times. This is in contrast to now where each struct pid refers to the same inode. Even if we were to move to anon_inode_create_getfile() which creates new inodes we'd still be associating the same struct pid with multiple different inodes. The tiny pseudo filesystem is not visible anywhere in userspace exactly like e.g., pipefs and sockfs. There's no lookup, there are no complex inode operations, nothing. Dentries and inodes are always deleted when the last pidfd is closed. We allocate a new inode for each struct pid and we reuse that inode for all pidfds. We use iget_locked() to find that inode again based on the inode number which isn't recycled. We allocate a new dentry for each pidfd that uses the same inode. That is similar to anonymous inodes which reuse the same inode for thousands of dentries. For pidfds we're talking way less than that. There usually won't be a lot of concurrent openers of the same struct pid. They can probably often be counted on two hands. I know that systemd does use separate pidfds for the same struct pid for various complex process tracking issues. So I think with that things actually become way simpler. Especially because we don't have to care about lookup. Dentries and inodes continue to be always deleted. The code is entirely optional and fairly small. If it's not selected we fall back to anonymous inodes. Heavily inspired by nsfs which uses a similar stashing mechanism just for namespaces. 
Link: https://lore.kernel.org/r/20240213-vfs-pidfd_fs-v1-2-f863f58cfce1@kernel.org Signed-off-by: Christian Brauner --- fs/Kconfig | 7 ++ fs/pidfs.c | 156 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/pid.h | 5 +- include/linux/pidfs.h | 8 +++ include/uapi/linux/magic.h | 1 + init/main.c | 2 + kernel/fork.c | 13 +--- kernel/nsproxy.c | 2 +- kernel/pid.c | 11 ++++ 9 files changed, 188 insertions(+), 17 deletions(-) create mode 100644 include/linux/pidfs.h (limited to 'include/uapi') diff --git a/fs/Kconfig b/fs/Kconfig index 89fdbefd1075..f3dbd84a0e40 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -174,6 +174,13 @@ source "fs/proc/Kconfig" source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" +config FS_PID + bool "Pseudo filesystem for process file descriptors" + depends on 64BIT + default y + help + Pidfs implements advanced features for process file descriptors. + config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM diff --git a/fs/pidfs.c b/fs/pidfs.c index eccb291862a0..6c3f010074af 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include #include #include #include #include +#include #include #include #include @@ -14,10 +16,12 @@ static int pidfd_release(struct inode *inode, struct file *file) { +#ifndef CONFIG_FS_PID struct pid *pid = file->private_data; file->private_data = NULL; put_pid(pid); +#endif return 0; } @@ -59,7 +63,7 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid *pid = f->private_data; + struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; @@ -93,7 +97,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { - struct pid *pid = file->private_data; + struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & 
PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; @@ -113,10 +117,156 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) return poll_flags; } -const struct file_operations pidfd_fops = { +static const struct file_operations pidfs_file_operations = { .release = pidfd_release, .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif }; + +struct pid *pidfd_pid(const struct file *file) +{ + if (file->f_op != &pidfs_file_operations) + return ERR_PTR(-EBADF); +#ifdef CONFIG_FS_PID + return file_inode(file)->i_private; +#else + return file->private_data; +#endif +} + +#ifdef CONFIG_FS_PID +static struct vfsmount *pidfs_mnt __ro_after_init; +static struct super_block *pidfs_sb __ro_after_init; + +/* + * The vfs falls back to simple_setattr() if i_op->setattr() isn't + * implemented. Let's reject it completely until we have a clean + * permission concept for pidfds. + */ +static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + return 0; +} + +static const struct inode_operations pidfs_inode_operations = { + .getattr = pidfs_getattr, + .setattr = pidfs_setattr, +}; + +static void pidfs_evict_inode(struct inode *inode) +{ + struct pid *pid = inode->i_private; + + clear_inode(inode); + put_pid(pid); +} + +static const struct super_operations pidfs_sops = { + .drop_inode = generic_delete_inode, + .evict_inode = pidfs_evict_inode, + .statfs = simple_statfs, +}; + +static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) +{ + return dynamic_dname(buffer, buflen, "pidfd:[%lu]", + d_inode(dentry)->i_ino); +} + +static const struct dentry_operations pidfs_dentry_operations 
= { + .d_delete = always_delete_dentry, + .d_dname = pidfs_dname, +}; + +static int pidfs_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx; + + ctx = init_pseudo(fc, PID_FS_MAGIC); + if (!ctx) + return -ENOMEM; + + ctx->ops = &pidfs_sops; + ctx->dops = &pidfs_dentry_operations; + return 0; +} + +static struct file_system_type pidfs_type = { + .name = "pidfs", + .init_fs_context = pidfs_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + + struct inode *inode; + struct file *pidfd_file; + + inode = iget_locked(pidfs_sb, pid->ino); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (inode->i_state & I_NEW) { + /* + * Inode numbering for pidfs start at RESERVED_PIDS + 1. + * This avoids collisions with the root inode which is 1 + * for pseudo filesystems. + */ + inode->i_ino = pid->ino; + inode->i_mode = S_IFREG | S_IRUGO; + inode->i_op = &pidfs_inode_operations; + inode->i_fop = &pidfs_file_operations; + inode->i_flags |= S_IMMUTABLE; + inode->i_private = get_pid(pid); + simple_inode_init_ts(inode); + unlock_new_inode(inode); + } + + pidfd_file = alloc_file_pseudo(inode, pidfs_mnt, "", flags, + &pidfs_file_operations); + if (IS_ERR(pidfd_file)) + iput(inode); + + return pidfd_file; +} + +void __init pidfs_init(void) +{ + pidfs_mnt = kern_mount(&pidfs_type); + if (IS_ERR(pidfs_mnt)) + panic("Failed to mount pidfs pseudo filesystem"); + + pidfs_sb = pidfs_mnt->mnt_sb; +} + +#else /* !CONFIG_FS_PID */ + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) +{ + struct file *pidfd_file; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfs_file_operations, pid, + flags | O_RDWR); + if (IS_ERR(pidfd_file)) + return pidfd_file; + + get_pid(pid); + return pidfd_file; +} + +void __init pidfs_init(void) { } +#endif diff --git a/include/linux/pid.h b/include/linux/pid.h index 8124d57752b9..956481128e8d 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -55,6 
+55,9 @@ struct pid refcount_t count; unsigned int level; spinlock_t lock; +#ifdef CONFIG_FS_PID + unsigned long ino; +#endif /* lists of tasks that use this pid */ struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head inodes; @@ -66,8 +69,6 @@ struct pid extern struct pid init_struct_pid; -extern const struct file_operations pidfd_fops; - struct file; struct pid *pidfd_pid(const struct file *file); diff --git a/include/linux/pidfs.h b/include/linux/pidfs.h new file mode 100644 index 000000000000..75bdf9807802 --- /dev/null +++ b/include/linux/pidfs.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_PID_FS_H +#define _LINUX_PID_FS_H + +struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); +void __init pidfs_init(void); + +#endif /* _LINUX_PID_FS_H */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 6325d1d0e90f..1b40a968ba91 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -101,5 +101,6 @@ #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */ #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ +#define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/main.c b/init/main.c index e24b0780fdff..2fbf6a3114d5 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include #include #include +#include #include #include @@ -1059,6 +1060,7 @@ void start_kernel(void) seq_file_init(); proc_root_init(); nsfs_init(); + pidfs_init(); cpuset_init(); cgroup_init(); taskstats_init_early(); diff --git a/kernel/fork.c b/kernel/fork.c index 662a61f340ce..2f839c290dcf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include @@ -1985,14 +1986,6 @@ static inline void rcu_copy_process(struct task_struct *p) #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } -struct pid *pidfd_pid(const struct file *file) -{ - if (file->f_op == &pidfd_fops) - return 
file->private_data; - - return ERR_PTR(-EBADF); -} - /** * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd @@ -2030,13 +2023,11 @@ static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **re if (pidfd < 0) return pidfd; - pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - flags | O_RDWR); + pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfd_file)) { put_unused_fd(pidfd); return PTR_ERR(pidfd_file); } - get_pid(pid); /* held by pidfd_file now */ /* * anon_inode_getfile() ignores everything outside of the * O_ACCMODE | O_NONBLOCK mask, set PIDFD_THREAD manually. diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 15781acaac1c..6ec3deec68c2 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -573,7 +573,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, flags) if (proc_ns_file(f.file)) err = validate_ns(&nsset, ns); else - err = validate_nsset(&nsset, f.file->private_data); + err = validate_nsset(&nsset, pidfd_pid(f.file)); if (!err) { commit_nsset(&nsset); perf_event_namespaces(current); diff --git a/kernel/pid.c b/kernel/pid.c index c1d940fbd314..581cc34341fd 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,13 @@ int pid_max = PID_MAX_DEFAULT; int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; +#ifdef CONFIG_FS_PID +/* + * Pseudo filesystems start inode numbering after one. We use Reserved + * PIDs as a natural offset. + */ +static u64 pidfs_ino = RESERVED_PIDS; +#endif /* * PID-map pages start out as NULL, they get allocated upon @@ -272,6 +280,9 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; +#ifdef CONFIG_FS_PID + pid->ino = ++pidfs_ino; +#endif for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. 
*/ idr_replace(&upid->ns->idr, pid, upid->nr); -- cgit v1.2.3