Diffstat (limited to 'fs/proc')
-rw-r--r--   fs/proc/base.c        | 118
-rw-r--r--   fs/proc/consoles.c    |   7
-rw-r--r--   fs/proc/fd.c          |  18
-rw-r--r--   fs/proc/generic.c     |  10
-rw-r--r--   fs/proc/inode.c       |  31
-rw-r--r--   fs/proc/internal.h    |  45
-rw-r--r--   fs/proc/kcore.c       |   2
-rw-r--r--   fs/proc/page.c        |  53
-rw-r--r--   fs/proc/proc_sysctl.c |  81
-rw-r--r--   fs/proc/task_mmu.c    | 496
10 files changed, 630 insertions, 231 deletions
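One user-visible knob added by the fs/proc/base.c hunks below is the proc_mem.force_override= early parameter, which controls whether FOLL_FORCE is applied to /proc/<pid>/mem access. It accepts the values always, ptrace or never (see the proc_mem_force_table in the hunk), and an unrecognized value falls back to the Kconfig default via lookup_constant(). As an illustration only, with the rest of the kernel command line elided, restricting forced access to active ptracers would look like:

    ... proc_mem.force_override=ptrace ...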
diff --git a/fs/proc/base.c b/fs/proc/base.c index 72a1acd03675..b31283d81c52 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -85,6 +85,7 @@ #include <linux/elf.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> +#include <linux/fs_parser.h> #include <linux/fs_struct.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> @@ -117,6 +118,40 @@ static u8 nlink_tid __ro_after_init; static u8 nlink_tgid __ro_after_init; +enum proc_mem_force { + PROC_MEM_FORCE_ALWAYS, + PROC_MEM_FORCE_PTRACE, + PROC_MEM_FORCE_NEVER +}; + +static enum proc_mem_force proc_mem_force_override __ro_after_init = + IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER : + IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE : + PROC_MEM_FORCE_ALWAYS; + +static const struct constant_table proc_mem_force_table[] __initconst = { + { "always", PROC_MEM_FORCE_ALWAYS }, + { "ptrace", PROC_MEM_FORCE_PTRACE }, + { "never", PROC_MEM_FORCE_NEVER }, + { } +}; + +static int __init early_proc_mem_force_override(char *buf) +{ + if (!buf) + return -EINVAL; + + /* + * lookup_constant() defaults to proc_mem_force_override to preseve + * the initial Kconfig choice in case an invalid param gets passed. + */ + proc_mem_force_override = lookup_constant(proc_mem_force_table, + buf, proc_mem_force_override); + + return 0; +} +early_param("proc_mem.force_override", early_proc_mem_force_override); + struct pid_entry { const char *name; unsigned int len; @@ -827,12 +862,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) static int mem_open(struct inode *inode, struct file *file) { - int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); - - /* OK to pass negative loff_t, we can catch out-of-range */ - file->f_mode |= FMODE_UNSIGNED_OFFSET; + if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET))) + return -EINVAL; + return __mem_open(inode, file, PTRACE_MODE_ATTACH); +} - return ret; +static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm) +{ + struct task_struct *task; + bool ptrace_active = false; + + switch (proc_mem_force_override) { + case PROC_MEM_FORCE_NEVER: + return false; + case PROC_MEM_FORCE_PTRACE: + task = get_proc_task(file_inode(file)); + if (task) { + ptrace_active = READ_ONCE(task->ptrace) && + READ_ONCE(task->mm) == mm && + READ_ONCE(task->parent) == current; + put_task_struct(task); + } + return ptrace_active; + default: + return true; + } } static ssize_t mem_rw(struct file *file, char __user *buf, @@ -855,7 +909,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - flags = FOLL_FORCE | (write ? FOLL_WRITE : 0); + flags = write ? 
FOLL_WRITE : 0; + if (proc_mem_foll_force(file, mm)) + flags |= FOLL_FORCE; while (count > 0) { size_t this_len = min_t(size_t, count, PAGE_SIZE); @@ -932,6 +988,7 @@ static const struct file_operations proc_mem_operations = { .write = mem_write, .open = mem_open, .release = mem_release, + .fop_flags = FOP_UNSIGNED_OFFSET, }; static int environ_open(struct inode *inode, struct file *file) @@ -2276,8 +2333,8 @@ proc_map_files_instantiate(struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - d_set_d_op(dentry, &tid_map_files_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_map_files_dentry_operations); } static struct dentry *proc_map_files_lookup(struct inode *dir, @@ -2456,13 +2513,13 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->sighand) return ERR_PTR(-ESRCH); - return seq_list_start(&tp->task->signal->posix_timers, *pos); + return seq_hlist_start(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_list_next(v, &tp->task->signal->posix_timers, pos); + return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) @@ -2491,7 +2548,7 @@ static int show_timer(struct seq_file *m, void *v) [SIGEV_THREAD] = "thread", }; - timer = list_entry((struct list_head *)v, struct k_itimer, list); + timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); @@ -2569,10 +2626,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf, } task_lock(p); - if (slack_ns == 0) - p->timer_slack_ns = p->default_timer_slack_ns; - else - p->timer_slack_ns = slack_ns; + if (rt_or_dl_task_policy(p)) + slack_ns = 0; + else if (slack_ns == 0) + slack_ns = p->default_timer_slack_ns; + p->timer_slack_ns = slack_ns; task_unlock(p); out: @@ -3870,12 +3928,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. + /* We cache the tgid value that the last readdir call couldn't + * return and lseek resets it to 0. */ ns = proc_pid_ns(inode->i_sb); - tid = (int)file->f_version; - file->f_version = 0; + tid = (int)(intptr_t)file->private_data; + file->private_data = NULL; for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns); task; task = next_tid(task), ctx->pos++) { @@ -3890,7 +3948,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) proc_task_instantiate, task, NULL)) { /* returning this tgid failed, save it as the first * pid for the next readir call */ - file->f_version = (u64)tid; + file->private_data = (void *)(intptr_t)tid; put_task_struct(task); break; } @@ -3915,6 +3973,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap, return 0; } +/* + * proc_task_readdir() set @file->private_data to a positive integer + * value, so casting that to u64 is safe. generic_llseek_cookie() will + * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is + * here to catch any unexpected change in behavior either in + * proc_task_readdir() or generic_llseek_cookie(). 
+ */ +static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence) +{ + u64 cookie = (u64)(intptr_t)file->private_data; + loff_t off; + + off = generic_llseek_cookie(file, offset, whence, &cookie); + WARN_ON_ONCE(cookie > INT_MAX); + file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */ + return off; +} + static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, @@ -3925,7 +4001,7 @@ static const struct inode_operations proc_task_inode_operations = { static const struct file_operations proc_task_operations = { .read = generic_read_dir, .iterate_shared = proc_task_readdir, - .llseek = generic_file_llseek, + .llseek = proc_dir_llseek, }; void __init set_proc_pid_nlink(void) diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c index e0758fe7936d..b7cab1ad990d 100644 --- a/fs/proc/consoles.c +++ b/fs/proc/consoles.c @@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v) { CON_ENABLED, 'E' }, { CON_CONSDEV, 'C' }, { CON_BOOT, 'B' }, + { CON_NBCON, 'N' }, { CON_PRINTBUFFER, 'p' }, { CON_BRL, 'b' }, { CON_ANYTIME, 'a' }, @@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v) seq_printf(m, "%s%d", con->name, con->index); seq_pad(m, ' '); seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-', - con->write ? 'W' : '-', con->unblank ? 'U' : '-', - flags); + ((con->flags & CON_NBCON) || con->write) ? 'W' : '-', + con->unblank ? 'U' : '-', flags); if (dev) seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev)); @@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v) } static void *c_start(struct seq_file *m, loff_t *pos) + __acquires(&console_mutex) { struct console *con; loff_t off = 0; @@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos) } static void c_stop(struct seq_file *m, void *v) + __releases(&console_mutex) { console_list_unlock(); } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 586bbc84ca04..1f54a54bfb91 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v) real_mount(file->f_path.mnt)->mnt_id, file_inode(file)->i_ino); - /* show_fd_locks() never deferences files so a stale value is safe */ + /* show_fd_locks() never dereferences files, so a stale value is safe */ show_fd_locks(m, file, files); if (seq_has_overflowed(m)) goto out; @@ -220,8 +220,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry, ei->op.proc_get_link = proc_fd_link; tid_fd_update_inode(task, inode, data->mode); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry *proc_lookupfd_common(struct inode *dir, @@ -312,14 +312,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count) return 0; } -static int proc_readfd(struct file *file, struct dir_context *ctx) +static int proc_fd_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fd_instantiate); } const struct file_operations proc_fd_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfd, + .iterate_shared = proc_fd_iterate, .llseek = generic_file_llseek, }; @@ -397,8 +397,8 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry, inode->i_fop = &proc_fdinfo_file_operations; tid_fd_update_inode(task, inode, 0); - d_set_d_op(dentry, &tid_fd_dentry_operations); - return d_splice_alias(inode, dentry); + 
return proc_splice_unmountable(inode, dentry, + &tid_fd_dentry_operations); } static struct dentry * @@ -407,7 +407,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags) return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate); } -static int proc_readfdinfo(struct file *file, struct dir_context *ctx) +static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx) { return proc_readfd_common(file, ctx, proc_fdinfo_instantiate); @@ -421,6 +421,6 @@ const struct inode_operations proc_fdinfo_inode_operations = { const struct file_operations proc_fdinfo_operations = { .read = generic_read_dir, - .iterate_shared = proc_readfdinfo, + .iterate_shared = proc_fdinfo_iterate, .llseek = generic_file_llseek, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 775ce0bcf08c..dbe82cf23ee4 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum) { int i; - i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1, - GFP_KERNEL); + i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST, + GFP_KERNEL); if (i < 0) return i; @@ -213,7 +213,7 @@ int proc_alloc_inum(unsigned int *inum) void proc_free_inum(unsigned int inum) { - ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); + ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); } static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags) @@ -464,9 +464,9 @@ struct proc_dir_entry *proc_symlink(const char *name, (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1); if (ent) { - ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); + ent->size = strlen(dest); + ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL); if (ent->data) { - strcpy((char*)ent->data,dest); ent->proc_iops = &proc_link_inode_operations; ent = proc_register(parent, ent); } else { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d19434e2a58e..626ad7bd94f2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -303,9 +303,7 @@ static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter) static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos) { - typeof_member(struct proc_ops, proc_read) read; - - read = pde->proc_ops->proc_read; + __auto_type read = pde->proc_ops->proc_read; if (read) return read(file, buf, count, ppos); return -EIO; @@ -327,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count, static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - typeof_member(struct proc_ops, proc_write) write; - - write = pde->proc_ops->proc_write; + __auto_type write = pde->proc_ops->proc_write; if (write) return write(file, buf, count, ppos); return -EIO; @@ -351,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts) { - typeof_member(struct proc_ops, proc_poll) poll; - - poll = pde->proc_ops->proc_poll; + __auto_type poll = pde->proc_ops->proc_poll; if (poll) return poll(file, pts); return DEFAULT_POLLMASK; @@ -375,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts) static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { - typeof_member(struct proc_ops, proc_ioctl) ioctl; - - ioctl = pde->proc_ops->proc_ioctl; + __auto_type 
ioctl = pde->proc_ops->proc_ioctl; if (ioctl) return ioctl(file, cmd, arg); return -ENOTTY; @@ -400,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne #ifdef CONFIG_COMPAT static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg) { - typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl; - - compat_ioctl = pde->proc_ops->proc_compat_ioctl; + __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl; if (compat_ioctl) return compat_ioctl(file, cmd, arg); return -ENOTTY; @@ -424,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma) { - typeof_member(struct proc_ops, proc_mmap) mmap; - - mmap = pde->proc_ops->proc_mmap; + __auto_type mmap = pde->proc_ops->proc_mmap; if (mmap) return mmap(file, vma); return -EIO; @@ -483,7 +471,6 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct proc_dir_entry *pde = PDE(inode); int rv = 0; typeof_member(struct proc_ops, proc_open) open; - typeof_member(struct proc_ops, proc_release) release; struct pde_opener *pdeo; if (!pde->proc_ops->proc_lseek) @@ -510,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!use_pde(pde)) return -ENOENT; - release = pde->proc_ops->proc_release; + __auto_type release = pde->proc_ops->proc_release; if (release) { pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL); if (!pdeo) { @@ -547,9 +534,7 @@ static int proc_reg_release(struct inode *inode, struct file *file) struct pde_opener *pdeo; if (pde_is_permanent(pde)) { - typeof_member(struct proc_ops, proc_release) release; - - release = pde->proc_ops->proc_release; + __auto_type release = pde->proc_ops->proc_release; if (release) { return release(inode, file); } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a71ac5379584..87e4d6282025 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -13,6 +13,7 @@ #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> +#include <linux/mm.h> struct ctl_table_header; struct mempolicy; @@ -142,6 +143,37 @@ unsigned name_to_int(const struct qstr *qstr); /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 +/** + * folio_precise_page_mapcount() - Number of mappings of this folio page. + * @folio: The folio. + * @page: The page. + * + * The number of present user page table entries that reference this page + * as tracked via the RMAP: either referenced directly (PTE) or as part of + * a larger area that covers this page (e.g., PMD). + * + * Use this function only for the calculation of existing statistics + * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount). + * + * Do not add new users. + * + * Returns: The number of mappings of this folio page. 0 for + * folios that are not mapped to user space or are not tracked via the RMAP + * (e.g., shared zeropage). 
+ */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + int mapcount = atomic_read(&page->_mapcount) + 1; + + if (page_mapcount_is_type(mapcount)) + mapcount = 0; + if (folio_test_large(folio)) + mapcount += folio_entire_mapcount(folio); + + return mapcount; +} + /* * array.c */ @@ -316,3 +348,16 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde) /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; } + +/* + * Add a new procfs dentry that can't serve as a mountpoint. That should + * encompass anything that is ephemeral and can just disappear while the + * process is still around. + */ +static inline struct dentry *proc_splice_unmountable(struct inode *inode, + struct dentry *dentry, const struct dentry_operations *d_ops) +{ + d_set_d_op(dentry, d_ops); + dont_mount(dentry); + return d_splice_alias(inode, dentry); +} diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 8e08a9a1b7ed..7d0acdad74e2 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list) int nid, ret; unsigned long end_pfn; - /* Not inialized....update now */ + /* Not initialized....update now */ /* find out "max pfn" */ end_pfn = 0; for_each_node_state(nid, N_MEMORY) { diff --git a/fs/proc/page.c b/fs/proc/page.c index 2fb64bdb64eb..a55f5acefa97 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -37,21 +37,19 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page counts +/* /proc/kpagecount - an array exposing page mapcounts * * Each entry is a u64 representing the corresponding - * physical page count. + * physical page mapcount. */ static ssize_t kpagecount_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; - struct page *ppage; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 pcount; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -61,18 +59,19 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { + struct page *page; + u64 mapcount = 0; + /* * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ - ppage = pfn_to_online_page(pfn); + page = pfn_to_online_page(pfn); + if (page) + mapcount = folio_precise_page_mapcount(page_folio(page), + page); - if (!ppage) - pcount = 0; - else - pcount = page_mapcount(ppage); - - if (put_user(pcount, out)) { + if (put_user(mapcount, out)) { ret = -EFAULT; break; } @@ -148,19 +147,16 @@ u64 stable_page_flags(const struct page *page) u |= 1 << KPF_COMPOUND_TAIL; if (folio_test_hugetlb(folio)) u |= 1 << KPF_HUGE; - /* - * We need to check PageLRU/PageAnon - * to make sure a given page is a thp, not a non-huge compound page. 
- */ - else if (folio_test_large(folio)) { - if ((k & (1 << PG_lru)) || is_anon) - u |= 1 << KPF_THP; - else if (is_huge_zero_folio(folio)) { - u |= 1 << KPF_ZERO_PAGE; - u |= 1 << KPF_THP; - } - } else if (is_zero_pfn(page_to_pfn(page))) + else if (folio_test_large(folio) && + folio_test_large_rmappable(folio)) { + /* Note: we indicate any THPs here, not just PMD-sized ones */ + u |= 1 << KPF_THP; + } else if (is_huge_zero_folio(folio)) { + u |= 1 << KPF_ZERO_PAGE; + u |= 1 << KPF_THP; + } else if (is_zero_folio(folio)) { u |= 1 << KPF_ZERO_PAGE; + } /* * Caveats on high order pages: PG_buddy and PG_slab will only be set @@ -186,7 +182,6 @@ u64 stable_page_flags(const struct page *page) #endif u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); @@ -211,18 +206,16 @@ u64 stable_page_flags(const struct page *page) u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison); #endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); -#endif - u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); - u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2); u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); -#ifdef CONFIG_ARCH_USES_PG_ARCH_X +#ifdef CONFIG_ARCH_USES_PG_ARCH_2 u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); +#endif +#ifdef CONFIG_ARCH_USES_PG_ARCH_3 u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3); #endif diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b1c2c0b82116..d11ebc055ce0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -21,7 +21,7 @@ #define list_for_each_table_entry(entry, header) \ entry = header->ctl_table; \ - for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++) + for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++) static const struct dentry_operations proc_sys_dentry_operations; static const struct file_operations proc_sys_file_operations; @@ -29,8 +29,13 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; -/* Support for permanently empty directories */ -static struct ctl_table sysctl_mount_point[] = { }; +/* + * Support for permanently empty directories. + * Must be non-empty to avoid sharing an address with other tables. 
+ */ +static struct ctl_table sysctl_mount_point[] = { + { } +}; /** * register_sysctl_mount_point() - registers a sysctl mount point @@ -42,7 +47,7 @@ static struct ctl_table sysctl_mount_point[] = { }; */ struct ctl_table_header *register_sysctl_mount_point(const char *path) { - return register_sysctl(path, sysctl_mount_point); + return register_sysctl_sz(path, sysctl_mount_point, 0); } EXPORT_SYMBOL(register_sysctl_mount_point); @@ -476,12 +481,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, make_empty_dir_inode(inode); } + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; if (root->set_ownership) root->set_ownership(head, &inode->i_uid, &inode->i_gid); - else { - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - } return inode; } @@ -951,14 +954,14 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set, char *new_name; new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) + - sizeof(struct ctl_table)*2 + namelen + 1, + sizeof(struct ctl_table) + namelen + 1, GFP_KERNEL); if (!new) return NULL; node = (struct ctl_node *)(new + 1); table = (struct ctl_table *)(node + 1); - new_name = (char *)(table + 2); + new_name = (char *)(table + 1); memcpy(new_name, name, namelen); table[0].procname = new_name; table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO; @@ -1093,6 +1096,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...) static int sysctl_check_table_array(const char *path, struct ctl_table *table) { + unsigned int extra; int err = 0; if ((table->proc_handler == proc_douintvec) || @@ -1104,6 +1108,19 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table) if (table->proc_handler == proc_dou8vec_minmax) { if (table->maxlen != sizeof(u8)) err |= sysctl_err(path, table, "array not allowed"); + + if (table->extra1) { + extra = *(unsigned int *) table->extra1; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } + if (table->extra2) { + extra = *(unsigned int *) table->extra2; + if (extra > 255U) + err |= sysctl_err(path, table, + "range value too large for proc_dou8vec_minmax"); + } } if (table->proc_handler == proc_dobool) { @@ -1119,6 +1136,8 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header) struct ctl_table *entry; int err = 0; list_for_each_table_entry(entry, header) { + if (!entry->procname) + err |= sysctl_err(path, entry, "procname is null"); if ((entry->proc_handler == proc_dostring) || (entry->proc_handler == proc_dobool) || (entry->proc_handler == proc_dointvec) || @@ -1154,18 +1173,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_ struct ctl_table_header *links; struct ctl_node *node; char *link_name; - int nr_entries, name_bytes; + int name_bytes; name_bytes = 0; - nr_entries = 0; list_for_each_table_entry(entry, head) { - nr_entries++; name_bytes += strlen(entry->procname) + 1; } links = kzalloc(sizeof(struct ctl_table_header) + - sizeof(struct ctl_node)*nr_entries + - sizeof(struct ctl_table)*(nr_entries + 1) + + sizeof(struct ctl_node)*head->ctl_table_size + + sizeof(struct ctl_table)*head->ctl_table_size + name_bytes, GFP_KERNEL); @@ -1173,8 +1190,8 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_ return NULL; node = (struct ctl_node *)(links + 1); - link_table = (struct ctl_table *)(node + nr_entries); - link_name = (char *)&link_table[nr_entries + 1]; + link_table = (struct ctl_table *)(node + 
head->ctl_table_size); + link_name = (char *)(link_table + head->ctl_table_size); link = link_table; list_for_each_table_entry(entry, head) { @@ -1188,7 +1205,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_ } init_header(links, dir->header.root, dir->header.set, node, link_table, head->ctl_table_size); - links->nreg = nr_entries; + links->nreg = head->ctl_table_size; return links; } @@ -1300,28 +1317,23 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) * __register_sysctl_table - register a leaf sysctl table * @set: Sysctl tree to register on * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure without any child. This table - * should not be free'd after registration. So it should not be - * used on stack. It can either be a global or dynamically allocated - * by the caller and free'd later after sysctl unregistration. + * + * @table: the top-level table structure. This table should not be free'd + * after registration. So it should not be used on stack. It can either + * be a global or dynamically allocated by the caller and free'd later + * after sysctl unregistration. * @table_size : The number of elements in table * * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. + * array. * * The members of the &struct ctl_table structure are used as follows: - * * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file - * - * child - must be %NULL. - * + * data - a pointer to data for use by proc_handler + * maxlen - the maximum size in bytes of the data + * mode - the file permissions for the /proc/sys file + * type - Defines the target type (described in struct definition) * proc_handler - the text handler routine (described below) * * extra1, extra2 - extra pointers usable by the proc handler routines @@ -1329,8 +1341,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path) * [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org * * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes (where child is not NULL) are not allowed, - * sysctl_check_table() verifies this. + * under /proc; non-leaf nodes are not allowed. * * There must be a proc_handler routine for any terminal nodes. * Several default handlers are available to cover common cases - diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f8d35f993fe5..72f14fd59c2d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -22,6 +22,7 @@ #include <linux/pkeys.h> #include <linux/minmax.h> #include <linux/overflow.h> +#include <linux/buildid.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -239,6 +240,67 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } +static void get_vma_name(struct vm_area_struct *vma, + const struct path **path, + const char **name, + const char **name_fmt) +{ + struct anon_vma_name *anon_name = vma->vm_mm ? 
anon_vma_name(vma) : NULL; + + *name = NULL; + *path = NULL; + *name_fmt = NULL; + + /* + * Print the dentry name for named mappings, and a + * special [heap] marker for the heap: + */ + if (vma->vm_file) { + /* + * If user named this anon shared memory via + * prctl(PR_SET_VMA ..., use the provided name. + */ + if (anon_name) { + *name_fmt = "[anon_shmem:%s]"; + *name = anon_name->name; + } else { + *path = file_user_path(vma->vm_file); + } + return; + } + + if (vma->vm_ops && vma->vm_ops->name) { + *name = vma->vm_ops->name(vma); + if (*name) + return; + } + + *name = arch_vma_name(vma); + if (*name) + return; + + if (!vma->vm_mm) { + *name = "[vdso]"; + return; + } + + if (vma_is_initial_heap(vma)) { + *name = "[heap]"; + return; + } + + if (vma_is_initial_stack(vma)) { + *name = "[stack]"; + return; + } + + if (anon_name) { + *name_fmt = "[anon:%s]"; + *name = anon_name->name; + return; + } +} + static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -262,17 +324,15 @@ static void show_vma_header_prefix(struct seq_file *m, static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { - struct anon_vma_name *anon_name = NULL; - struct mm_struct *mm = vma->vm_mm; - struct file *file = vma->vm_file; + const struct path *path; + const char *name_fmt, *name; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; unsigned long start, end; dev_t dev = 0; - const char *name = NULL; - if (file) { + if (vma->vm_file) { const struct inode *inode = file_user_inode(vma->vm_file); dev = inode->i_sb->s_dev; @@ -283,57 +343,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) start = vma->vm_start; end = vma->vm_end; show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); - if (mm) - anon_name = anon_vma_name(vma); - /* - * Print the dentry name for named mappings, and a - * special [heap] marker for the heap: - */ - if (file) { + get_vma_name(vma, &path, &name, &name_fmt); + if (path) { seq_pad(m, ' '); - /* - * If user named this anon shared memory via - * prctl(PR_SET_VMA ..., use the provided name. 
- */ - if (anon_name) - seq_printf(m, "[anon_shmem:%s]", anon_name->name); - else - seq_path(m, file_user_path(file), "\n"); - goto done; - } - - if (vma->vm_ops && vma->vm_ops->name) { - name = vma->vm_ops->name(vma); - if (name) - goto done; - } - - name = arch_vma_name(vma); - if (!name) { - if (!mm) { - name = "[vdso]"; - goto done; - } - - if (vma_is_initial_heap(vma)) { - name = "[heap]"; - goto done; - } - - if (vma_is_initial_stack(vma)) { - name = "[stack]"; - goto done; - } - - if (anon_name) { - seq_pad(m, ' '); - seq_printf(m, "[anon:%s]", anon_name->name); - } - } - -done: - if (name) { + seq_path(m, path, "\n"); + } else if (name_fmt) { + seq_pad(m, ' '); + seq_printf(m, name_fmt, name); + } else if (name) { seq_pad(m, ' '); seq_puts(m, name); } @@ -358,11 +376,253 @@ static int pid_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } +#define PROCMAP_QUERY_VMA_FLAGS ( \ + PROCMAP_QUERY_VMA_READABLE | \ + PROCMAP_QUERY_VMA_WRITABLE | \ + PROCMAP_QUERY_VMA_EXECUTABLE | \ + PROCMAP_QUERY_VMA_SHARED \ +) + +#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \ + PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \ + PROCMAP_QUERY_FILE_BACKED_VMA | \ + PROCMAP_QUERY_VMA_FLAGS \ +) + +static int query_vma_setup(struct mm_struct *mm) +{ + return mmap_read_lock_killable(mm); +} + +static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma) +{ + mmap_read_unlock(mm); +} + +static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr) +{ + return find_vma(mm, addr); +} + +static struct vm_area_struct *query_matching_vma(struct mm_struct *mm, + unsigned long addr, u32 flags) +{ + struct vm_area_struct *vma; + +next_vma: + vma = query_vma_find_by_addr(mm, addr); + if (!vma) + goto no_vma; + + /* user requested only file-backed VMA, keep iterating */ + if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file) + goto skip_vma; + + /* VMA permissions should satisfy query flags */ + if (flags & PROCMAP_QUERY_VMA_FLAGS) { + u32 perm = 0; + + if (flags & PROCMAP_QUERY_VMA_READABLE) + perm |= VM_READ; + if (flags & PROCMAP_QUERY_VMA_WRITABLE) + perm |= VM_WRITE; + if (flags & PROCMAP_QUERY_VMA_EXECUTABLE) + perm |= VM_EXEC; + if (flags & PROCMAP_QUERY_VMA_SHARED) + perm |= VM_MAYSHARE; + + if ((vma->vm_flags & perm) != perm) + goto skip_vma; + } + + /* found covering VMA or user is OK with the matching next VMA */ + if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr) + return vma; + +skip_vma: + /* + * If the user needs closest matching VMA, keep iterating. 
+ */ + addr = vma->vm_end; + if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) + goto next_vma; + +no_vma: + return ERR_PTR(-ENOENT); +} + +static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg) +{ + struct procmap_query karg; + struct vm_area_struct *vma; + struct mm_struct *mm; + const char *name = NULL; + char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL; + __u64 usize; + int err; + + if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize))) + return -EFAULT; + /* argument struct can never be that large, reject abuse */ + if (usize > PAGE_SIZE) + return -E2BIG; + /* argument struct should have at least query_flags and query_addr fields */ + if (usize < offsetofend(struct procmap_query, query_addr)) + return -EINVAL; + err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize); + if (err) + return err; + + /* reject unknown flags */ + if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK) + return -EINVAL; + /* either both buffer address and size are set, or both should be zero */ + if (!!karg.vma_name_size != !!karg.vma_name_addr) + return -EINVAL; + if (!!karg.build_id_size != !!karg.build_id_addr) + return -EINVAL; + + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) + return -ESRCH; + + err = query_vma_setup(mm); + if (err) { + mmput(mm); + return err; + } + + vma = query_matching_vma(mm, karg.query_addr, karg.query_flags); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + vma = NULL; + goto out; + } + + karg.vma_start = vma->vm_start; + karg.vma_end = vma->vm_end; + + karg.vma_flags = 0; + if (vma->vm_flags & VM_READ) + karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE; + if (vma->vm_flags & VM_WRITE) + karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE; + if (vma->vm_flags & VM_EXEC) + karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE; + if (vma->vm_flags & VM_MAYSHARE) + karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED; + + karg.vma_page_size = vma_kernel_pagesize(vma); + + if (vma->vm_file) { + const struct inode *inode = file_user_inode(vma->vm_file); + + karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT; + karg.dev_major = MAJOR(inode->i_sb->s_dev); + karg.dev_minor = MINOR(inode->i_sb->s_dev); + karg.inode = inode->i_ino; + } else { + karg.vma_offset = 0; + karg.dev_major = 0; + karg.dev_minor = 0; + karg.inode = 0; + } + + if (karg.build_id_size) { + __u32 build_id_sz; + + err = build_id_parse(vma, build_id_buf, &build_id_sz); + if (err) { + karg.build_id_size = 0; + } else { + if (karg.build_id_size < build_id_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.build_id_size = build_id_sz; + } + } + + if (karg.vma_name_size) { + size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size); + const struct path *path; + const char *name_fmt; + size_t name_sz = 0; + + get_vma_name(vma, &path, &name, &name_fmt); + + if (path || name_fmt || name) { + name_buf = kmalloc(name_buf_sz, GFP_KERNEL); + if (!name_buf) { + err = -ENOMEM; + goto out; + } + } + if (path) { + name = d_path(path, name_buf, name_buf_sz); + if (IS_ERR(name)) { + err = PTR_ERR(name); + goto out; + } + name_sz = name_buf + name_buf_sz - name; + } else if (name || name_fmt) { + name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name); + name = name_buf; + } + if (name_sz > name_buf_sz) { + err = -ENAMETOOLONG; + goto out; + } + karg.vma_name_size = name_sz; + } + + /* unlock vma or mmap_lock, and put mm_struct before copying data to user */ + query_vma_teardown(mm, vma); + mmput(mm); + + if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr), + name, 
karg.vma_name_size)) { + kfree(name_buf); + return -EFAULT; + } + kfree(name_buf); + + if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr), + build_id_buf, karg.build_id_size)) + return -EFAULT; + + if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize))) + return -EFAULT; + + return 0; + +out: + query_vma_teardown(mm, vma); + mmput(mm); + kfree(name_buf); + return err; +} + +static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + switch (cmd) { + case PROCMAP_QUERY: + return do_procmap_query(priv, (void __user *)arg); + default: + return -ENOIOCTLCMD; + } +} + const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, .release = proc_map_release, + .unlocked_ioctl = procfs_procmap_ioctl, + .compat_ioctl = compat_ptr_ioctl, }; /* @@ -442,7 +702,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked, - bool migration) + bool present) { struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; @@ -471,24 +731,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, * Then accumulate quantities that may depend on sharing, or that may * differ page-by-page. * - * refcount == 1 guarantees the page is mapped exactly once. - * If any subpage of the compound page mapped with PTE it would elevate - * the refcount. + * refcount == 1 for present entries guarantees that the folio is mapped + * exactly once. For large folios this implies that exactly one + * PTE/PMD/... maps (a part of) this folio. + * + * Treat all non-present entries (where relying on the mapcount and + * refcount doesn't make sense) as "maybe shared, but not sure how + * often". We treat device private entries as being fake-present. * - * The page_mapcount() is called to get a snapshot of the mapcount. - * Without holding the page lock this snapshot can be slightly wrong as - * we cannot always read the mapcount atomically. It is not safe to - * call page_mapcount() even with PTL held if the page is not mapped, - * especially for migration entries. Treat regular migration entries - * as mapcount == 1. + * Note that it would not be safe to read the mapcount especially for + * pages referenced by migration entries, even with the PTL held. */ - if ((folio_ref_count(folio) == 1) || migration) { + if (folio_ref_count(folio) == 1 || !present) { smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT, - dirty, locked, true); + dirty, locked, present); return; } + /* + * We obtain a snapshot of the mapcount. Without holding the folio lock + * this snapshot can be slightly wrong as we cannot always read the + * mapcount atomically. 
+ */ for (i = 0; i < nr; i++, page++) { - int mapcount = page_mapcount(page); + int mapcount = folio_precise_page_mapcount(folio, page); unsigned long pss = PAGE_SIZE << PSS_SHIFT; if (mapcount >= 2) pss /= mapcount; @@ -531,13 +796,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false, young = false, dirty = false; + bool present = false, young = false, dirty = false; pte_t ptent = ptep_get(pte); if (pte_present(ptent)) { page = vm_normal_page(vma, addr, ptent); young = pte_young(ptent); dirty = pte_dirty(ptent); + present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); @@ -555,8 +821,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } } else if (is_pfn_swap_entry(swpent)) { - if (is_migration_entry(swpent)) - migration = true; + if (is_device_private_entry(swpent)) + present = true; page = pfn_swap_entry_to_page(swpent); } } else { @@ -567,7 +833,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, young, dirty, locked, migration); + smaps_account(mss, page, false, young, dirty, locked, present); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -578,18 +844,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; + bool present = false; struct folio *folio; - bool migration = false; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); + present = true; } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - if (is_migration_entry(entry)) { - migration = true; + if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); - } } if (IS_ERR_OR_NULL(page)) return; @@ -604,7 +869,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), - locked, migration); + locked, present); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -696,7 +961,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", +#if VM_PKEY_BIT3 [ilog2(VM_PKEY_BIT3)] = "", +#endif #if VM_PKEY_BIT4 [ilog2(VM_PKEY_BIT4)] = "", #endif @@ -707,6 +974,12 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) + [ilog2(VM_DROPPABLE)] = "dp", +#endif +#ifdef CONFIG_64BIT + [ilog2(VM_SEALED)] = "sl", +#endif }; size_t i; @@ -730,19 +1003,23 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = walk->vma; - pte_t ptent = huge_ptep_get(pte); + pte_t ptent = huge_ptep_get(walk->mm, addr, pte); struct folio *folio = NULL; + bool present = false; if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); + present = true; } else if (is_swap_pte(ptent)) { swp_entry_t swpent = pte_to_swp_entry(ptent); if (is_pfn_swap_entry(swpent)) folio = pfn_swap_entry_folio(swpent); } + if (folio) { - if (folio_likely_mapped_shared(folio) || + /* We treat non-present entries as 
"maybe shared". */ + if (!present || folio_likely_mapped_shared(folio) || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else @@ -1088,7 +1365,7 @@ struct clear_refs_private { static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - struct page *page; + struct folio *folio; if (!pte_write(pte)) return false; @@ -1096,10 +1373,10 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, return false; if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags))) return false; - page = vm_normal_page(vma, addr, pte); - if (!page) + folio = vm_normal_folio(vma, addr, pte); + if (!folio) return false; - return page_maybe_dma_pinned(page); + return folio_maybe_dma_pinned(folio); } static inline void clear_soft_dirty(struct vm_area_struct *vma, @@ -1415,7 +1692,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, { u64 frame = 0, flags = 0; struct page *page = NULL; - bool migration = false; + struct folio *folio; if (pte_present(pte)) { if (pm->show_pfn) @@ -1447,17 +1724,20 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; - migration = is_migration_entry(entry); if (is_pfn_swap_entry(entry)) page = pfn_swap_entry_to_page(entry); if (pte_marker_entry_uffd_wp(entry)) flags |= PM_UFFD_WP; } - if (page && !PageAnon(page)) - flags |= PM_FILE; - if (page && !migration && page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + if ((flags & PM_PRESENT) && + folio_precise_page_mapcount(folio, page) == 1) + flags |= PM_MMAP_EXCLUSIVE; + } if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1473,13 +1753,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pte_t *pte, *orig_pte; int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - bool migration = false; ptl = pmd_trans_huge_lock(pmdp, vma); if (ptl) { + unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; struct page *page = NULL; + struct folio *folio = NULL; if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; @@ -1493,8 +1774,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_uffd_wp(pmd)) flags |= PM_UFFD_WP; if (pm->show_pfn) - frame = pmd_pfn(pmd) + - ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = pmd_pfn(pmd) + idx; } #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { @@ -1503,11 +1783,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pm->show_pfn) { if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); + offset = swp_offset_pfn(entry) + idx; else - offset = swp_offset(entry); - offset = offset + - ((addr & ~PMD_MASK) >> PAGE_SHIFT); + offset = swp_offset(entry) + idx; frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } @@ -1517,17 +1795,25 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (pmd_swp_uffd_wp(pmd)) flags |= PM_UFFD_WP; VM_BUG_ON(!is_pmd_migration_entry(pmd)); - migration = is_migration_entry(entry); page = pfn_swap_entry_to_page(entry); } #endif - if (page && !migration && page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + if (page) { + folio = page_folio(page); + if (!folio_test_anon(folio)) + flags |= PM_FILE; + } + + for (; addr != end; addr += PAGE_SIZE, idx++) { + unsigned long cur_flags = flags; + 
pagemap_entry_t pme; - for (; addr != end; addr += PAGE_SIZE) { - pagemap_entry_t pme = make_pme(frame, flags); + if (folio && (flags & PM_PRESENT) && + folio_precise_page_mapcount(folio, page + idx) == 1) + cur_flags |= PM_MMAP_EXCLUSIVE; + pme = make_pme(frame, cur_flags); err = add_to_pagemap(&pme, pm); if (err) break; @@ -1582,7 +1868,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, addr, ptep); if (pte_present(pte)) { struct folio *folio = page_folio(pte_page(pte)); @@ -2271,7 +2557,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, if (~p->arg.flags & PM_SCAN_WP_MATCHING) { /* Go the short route when not write-protecting pages. */ - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) @@ -2283,7 +2569,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, i_mmap_lock_write(vma->vm_file->f_mapping); ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); - pte = huge_ptep_get(ptep); + pte = huge_ptep_get(walk->mm, start, ptep); categories = p->cur_vma_category | pagemap_hugetlb_category(pte); if (!pagemap_scan_is_interesting_page(categories, p)) @@ -2563,7 +2849,7 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); - int count = page_mapcount(page); + int count = folio_precise_page_mapcount(folio, page); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) @@ -2679,7 +2965,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pte_t huge_pte = huge_ptep_get(pte); + pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte); struct numa_maps *md; struct page *page; |
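The fs/proc/task_mmu.c hunks above wire a PROCMAP_QUERY ioctl into /proc/<pid>/maps via do_procmap_query() and the procfs_procmap_ioctl() dispatcher. The userspace sketch below is illustrative only: it assumes the uapi definitions of struct procmap_query and the PROCMAP_QUERY / PROCMAP_QUERY_* constants that accompany this series in <linux/fs.h> (not part of this diff), and it fills in only the fields the kernel code above actually reads — size, query_flags, query_addr and an optional vma_name buffer.

/*
 * Sketch of querying the VMA covering (or following) a given address.
 * Assumes struct procmap_query and PROCMAP_QUERY* from the uapi header
 * added with this series; error handling is deliberately minimal.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* struct procmap_query, PROCMAP_QUERY */

int main(void)
{
	char name[4096];	/* receives the VMA name/path, if any */
	struct procmap_query q;
	int fd;

	fd = open("/proc/self/maps", O_RDONLY);
	if (fd < 0)
		return 1;

	memset(&q, 0, sizeof(q));
	q.size = sizeof(q);	/* kernel uses this to size the copy */
	/* VMA covering main(), or the next file-backed VMA after it */
	q.query_flags = PROCMAP_QUERY_COVERING_OR_NEXT_VMA |
			PROCMAP_QUERY_FILE_BACKED_VMA;
	q.query_addr = (uint64_t)(uintptr_t)&main;
	q.vma_name_addr = (uint64_t)(uintptr_t)name;
	q.vma_name_size = sizeof(name);

	if (ioctl(fd, PROCMAP_QUERY, &q) == 0)
		printf("%llx-%llx %s\n",
		       (unsigned long long)q.vma_start,
		       (unsigned long long)q.vma_end,
		       q.vma_name_size ? name : "[anonymous]");

	close(fd);
	return 0;
}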