summaryrefslogtreecommitdiff
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/base.c118
-rw-r--r--fs/proc/consoles.c7
-rw-r--r--fs/proc/fd.c18
-rw-r--r--fs/proc/generic.c10
-rw-r--r--fs/proc/inode.c31
-rw-r--r--fs/proc/internal.h45
-rw-r--r--fs/proc/kcore.c2
-rw-r--r--fs/proc/page.c53
-rw-r--r--fs/proc/proc_sysctl.c81
-rw-r--r--fs/proc/task_mmu.c496
10 files changed, 630 insertions, 231 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 72a1acd03675..b31283d81c52 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -85,6 +85,7 @@
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/fs_parser.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
@@ -117,6 +118,40 @@
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;
+enum proc_mem_force {
+ PROC_MEM_FORCE_ALWAYS,
+ PROC_MEM_FORCE_PTRACE,
+ PROC_MEM_FORCE_NEVER
+};
+
+static enum proc_mem_force proc_mem_force_override __ro_after_init =
+ IS_ENABLED(CONFIG_PROC_MEM_NO_FORCE) ? PROC_MEM_FORCE_NEVER :
+ IS_ENABLED(CONFIG_PROC_MEM_FORCE_PTRACE) ? PROC_MEM_FORCE_PTRACE :
+ PROC_MEM_FORCE_ALWAYS;
+
+static const struct constant_table proc_mem_force_table[] __initconst = {
+ { "always", PROC_MEM_FORCE_ALWAYS },
+ { "ptrace", PROC_MEM_FORCE_PTRACE },
+ { "never", PROC_MEM_FORCE_NEVER },
+ { }
+};
+
+static int __init early_proc_mem_force_override(char *buf)
+{
+ if (!buf)
+ return -EINVAL;
+
+ /*
+ * lookup_constant() defaults to proc_mem_force_override to preseve
+ * the initial Kconfig choice in case an invalid param gets passed.
+ */
+ proc_mem_force_override = lookup_constant(proc_mem_force_table,
+ buf, proc_mem_force_override);
+
+ return 0;
+}
+early_param("proc_mem.force_override", early_proc_mem_force_override);
+
struct pid_entry {
const char *name;
unsigned int len;
@@ -827,12 +862,31 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode)
static int mem_open(struct inode *inode, struct file *file)
{
- int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH);
-
- /* OK to pass negative loff_t, we can catch out-of-range */
- file->f_mode |= FMODE_UNSIGNED_OFFSET;
+ if (WARN_ON_ONCE(!(file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)))
+ return -EINVAL;
+ return __mem_open(inode, file, PTRACE_MODE_ATTACH);
+}
- return ret;
+static bool proc_mem_foll_force(struct file *file, struct mm_struct *mm)
+{
+ struct task_struct *task;
+ bool ptrace_active = false;
+
+ switch (proc_mem_force_override) {
+ case PROC_MEM_FORCE_NEVER:
+ return false;
+ case PROC_MEM_FORCE_PTRACE:
+ task = get_proc_task(file_inode(file));
+ if (task) {
+ ptrace_active = READ_ONCE(task->ptrace) &&
+ READ_ONCE(task->mm) == mm &&
+ READ_ONCE(task->parent) == current;
+ put_task_struct(task);
+ }
+ return ptrace_active;
+ default:
+ return true;
+ }
}
static ssize_t mem_rw(struct file *file, char __user *buf,
@@ -855,7 +909,9 @@ static ssize_t mem_rw(struct file *file, char __user *buf,
if (!mmget_not_zero(mm))
goto free;
- flags = FOLL_FORCE | (write ? FOLL_WRITE : 0);
+ flags = write ? FOLL_WRITE : 0;
+ if (proc_mem_foll_force(file, mm))
+ flags |= FOLL_FORCE;
while (count > 0) {
size_t this_len = min_t(size_t, count, PAGE_SIZE);
@@ -932,6 +988,7 @@ static const struct file_operations proc_mem_operations = {
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .fop_flags = FOP_UNSIGNED_OFFSET,
};
static int environ_open(struct inode *inode, struct file *file)
@@ -2276,8 +2333,8 @@ proc_map_files_instantiate(struct dentry *dentry,
inode->i_op = &proc_map_files_link_inode_operations;
inode->i_size = 64;
- d_set_d_op(dentry, &tid_map_files_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_map_files_dentry_operations);
}
static struct dentry *proc_map_files_lookup(struct inode *dir,
@@ -2456,13 +2513,13 @@ static void *timers_start(struct seq_file *m, loff_t *pos)
if (!tp->sighand)
return ERR_PTR(-ESRCH);
- return seq_list_start(&tp->task->signal->posix_timers, *pos);
+ return seq_hlist_start(&tp->task->signal->posix_timers, *pos);
}
static void *timers_next(struct seq_file *m, void *v, loff_t *pos)
{
struct timers_private *tp = m->private;
- return seq_list_next(v, &tp->task->signal->posix_timers, pos);
+ return seq_hlist_next(v, &tp->task->signal->posix_timers, pos);
}
static void timers_stop(struct seq_file *m, void *v)
@@ -2491,7 +2548,7 @@ static int show_timer(struct seq_file *m, void *v)
[SIGEV_THREAD] = "thread",
};
- timer = list_entry((struct list_head *)v, struct k_itimer, list);
+ timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list);
notify = timer->it_sigev_notify;
seq_printf(m, "ID: %d\n", timer->it_id);
@@ -2569,10 +2626,11 @@ static ssize_t timerslack_ns_write(struct file *file, const char __user *buf,
}
task_lock(p);
- if (slack_ns == 0)
- p->timer_slack_ns = p->default_timer_slack_ns;
- else
- p->timer_slack_ns = slack_ns;
+ if (rt_or_dl_task_policy(p))
+ slack_ns = 0;
+ else if (slack_ns == 0)
+ slack_ns = p->default_timer_slack_ns;
+ p->timer_slack_ns = slack_ns;
task_unlock(p);
out:
@@ -3870,12 +3928,12 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit_dots(file, ctx))
return 0;
- /* f_version caches the tgid value that the last readdir call couldn't
- * return. lseek aka telldir automagically resets f_version to 0.
+ /* We cache the tgid value that the last readdir call couldn't
+ * return and lseek resets it to 0.
*/
ns = proc_pid_ns(inode->i_sb);
- tid = (int)file->f_version;
- file->f_version = 0;
+ tid = (int)(intptr_t)file->private_data;
+ file->private_data = NULL;
for (task = first_tid(proc_pid(inode), tid, ctx->pos - 2, ns);
task;
task = next_tid(task), ctx->pos++) {
@@ -3890,7 +3948,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx)
proc_task_instantiate, task, NULL)) {
/* returning this tgid failed, save it as the first
* pid for the next readir call */
- file->f_version = (u64)tid;
+ file->private_data = (void *)(intptr_t)tid;
put_task_struct(task);
break;
}
@@ -3915,6 +3973,24 @@ static int proc_task_getattr(struct mnt_idmap *idmap,
return 0;
}
+/*
+ * proc_task_readdir() set @file->private_data to a positive integer
+ * value, so casting that to u64 is safe. generic_llseek_cookie() will
+ * set @cookie to 0, so casting to an int is safe. The WARN_ON_ONCE() is
+ * here to catch any unexpected change in behavior either in
+ * proc_task_readdir() or generic_llseek_cookie().
+ */
+static loff_t proc_dir_llseek(struct file *file, loff_t offset, int whence)
+{
+ u64 cookie = (u64)(intptr_t)file->private_data;
+ loff_t off;
+
+ off = generic_llseek_cookie(file, offset, whence, &cookie);
+ WARN_ON_ONCE(cookie > INT_MAX);
+ file->private_data = (void *)(intptr_t)cookie; /* serialized by f_pos_lock */
+ return off;
+}
+
static const struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
.getattr = proc_task_getattr,
@@ -3925,7 +4001,7 @@ static const struct inode_operations proc_task_inode_operations = {
static const struct file_operations proc_task_operations = {
.read = generic_read_dir,
.iterate_shared = proc_task_readdir,
- .llseek = generic_file_llseek,
+ .llseek = proc_dir_llseek,
};
void __init set_proc_pid_nlink(void)
diff --git a/fs/proc/consoles.c b/fs/proc/consoles.c
index e0758fe7936d..b7cab1ad990d 100644
--- a/fs/proc/consoles.c
+++ b/fs/proc/consoles.c
@@ -21,6 +21,7 @@ static int show_console_dev(struct seq_file *m, void *v)
{ CON_ENABLED, 'E' },
{ CON_CONSDEV, 'C' },
{ CON_BOOT, 'B' },
+ { CON_NBCON, 'N' },
{ CON_PRINTBUFFER, 'p' },
{ CON_BRL, 'b' },
{ CON_ANYTIME, 'a' },
@@ -58,8 +59,8 @@ static int show_console_dev(struct seq_file *m, void *v)
seq_printf(m, "%s%d", con->name, con->index);
seq_pad(m, ' ');
seq_printf(m, "%c%c%c (%s)", con->read ? 'R' : '-',
- con->write ? 'W' : '-', con->unblank ? 'U' : '-',
- flags);
+ ((con->flags & CON_NBCON) || con->write) ? 'W' : '-',
+ con->unblank ? 'U' : '-', flags);
if (dev)
seq_printf(m, " %4d:%d", MAJOR(dev), MINOR(dev));
@@ -68,6 +69,7 @@ static int show_console_dev(struct seq_file *m, void *v)
}
static void *c_start(struct seq_file *m, loff_t *pos)
+ __acquires(&console_mutex)
{
struct console *con;
loff_t off = 0;
@@ -94,6 +96,7 @@ static void *c_next(struct seq_file *m, void *v, loff_t *pos)
}
static void c_stop(struct seq_file *m, void *v)
+ __releases(&console_mutex)
{
console_list_unlock();
}
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 586bbc84ca04..1f54a54bfb91 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -59,7 +59,7 @@ static int seq_show(struct seq_file *m, void *v)
real_mount(file->f_path.mnt)->mnt_id,
file_inode(file)->i_ino);
- /* show_fd_locks() never deferences files so a stale value is safe */
+ /* show_fd_locks() never dereferences files, so a stale value is safe */
show_fd_locks(m, file, files);
if (seq_has_overflowed(m))
goto out;
@@ -220,8 +220,8 @@ static struct dentry *proc_fd_instantiate(struct dentry *dentry,
ei->op.proc_get_link = proc_fd_link;
tid_fd_update_inode(task, inode, data->mode);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *proc_lookupfd_common(struct inode *dir,
@@ -312,14 +312,14 @@ static int proc_readfd_count(struct inode *inode, loff_t *count)
return 0;
}
-static int proc_readfd(struct file *file, struct dir_context *ctx)
+static int proc_fd_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx, proc_fd_instantiate);
}
const struct file_operations proc_fd_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfd,
+ .iterate_shared = proc_fd_iterate,
.llseek = generic_file_llseek,
};
@@ -397,8 +397,8 @@ static struct dentry *proc_fdinfo_instantiate(struct dentry *dentry,
inode->i_fop = &proc_fdinfo_file_operations;
tid_fd_update_inode(task, inode, 0);
- d_set_d_op(dentry, &tid_fd_dentry_operations);
- return d_splice_alias(inode, dentry);
+ return proc_splice_unmountable(inode, dentry,
+ &tid_fd_dentry_operations);
}
static struct dentry *
@@ -407,7 +407,7 @@ proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
}
-static int proc_readfdinfo(struct file *file, struct dir_context *ctx)
+static int proc_fdinfo_iterate(struct file *file, struct dir_context *ctx)
{
return proc_readfd_common(file, ctx,
proc_fdinfo_instantiate);
@@ -421,6 +421,6 @@ const struct inode_operations proc_fdinfo_inode_operations = {
const struct file_operations proc_fdinfo_operations = {
.read = generic_read_dir,
- .iterate_shared = proc_readfdinfo,
+ .iterate_shared = proc_fdinfo_iterate,
.llseek = generic_file_llseek,
};
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 775ce0bcf08c..dbe82cf23ee4 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -202,8 +202,8 @@ int proc_alloc_inum(unsigned int *inum)
{
int i;
- i = ida_simple_get(&proc_inum_ida, 0, UINT_MAX - PROC_DYNAMIC_FIRST + 1,
- GFP_KERNEL);
+ i = ida_alloc_max(&proc_inum_ida, UINT_MAX - PROC_DYNAMIC_FIRST,
+ GFP_KERNEL);
if (i < 0)
return i;
@@ -213,7 +213,7 @@ int proc_alloc_inum(unsigned int *inum)
void proc_free_inum(unsigned int inum)
{
- ida_simple_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
+ ida_free(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
}
static int proc_misc_d_revalidate(struct dentry *dentry, unsigned int flags)
@@ -464,9 +464,9 @@ struct proc_dir_entry *proc_symlink(const char *name,
(S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
if (ent) {
- ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
+ ent->size = strlen(dest);
+ ent->data = kmemdup(dest, ent->size + 1, GFP_KERNEL);
if (ent->data) {
- strcpy((char*)ent->data,dest);
ent->proc_iops = &proc_link_inode_operations;
ent = proc_register(parent, ent);
} else {
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index d19434e2a58e..626ad7bd94f2 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -303,9 +303,7 @@ static ssize_t proc_reg_read_iter(struct kiocb *iocb, struct iov_iter *iter)
static ssize_t pde_read(struct proc_dir_entry *pde, struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_read) read;
-
- read = pde->proc_ops->proc_read;
+ __auto_type read = pde->proc_ops->proc_read;
if (read)
return read(file, buf, count, ppos);
return -EIO;
@@ -327,9 +325,7 @@ static ssize_t proc_reg_read(struct file *file, char __user *buf, size_t count,
static ssize_t pde_write(struct proc_dir_entry *pde, struct file *file, const char __user *buf, size_t count, loff_t *ppos)
{
- typeof_member(struct proc_ops, proc_write) write;
-
- write = pde->proc_ops->proc_write;
+ __auto_type write = pde->proc_ops->proc_write;
if (write)
return write(file, buf, count, ppos);
return -EIO;
@@ -351,9 +347,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t
static __poll_t pde_poll(struct proc_dir_entry *pde, struct file *file, struct poll_table_struct *pts)
{
- typeof_member(struct proc_ops, proc_poll) poll;
-
- poll = pde->proc_ops->proc_poll;
+ __auto_type poll = pde->proc_ops->proc_poll;
if (poll)
return poll(file, pts);
return DEFAULT_POLLMASK;
@@ -375,9 +369,7 @@ static __poll_t proc_reg_poll(struct file *file, struct poll_table_struct *pts)
static long pde_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_ioctl) ioctl;
-
- ioctl = pde->proc_ops->proc_ioctl;
+ __auto_type ioctl = pde->proc_ops->proc_ioctl;
if (ioctl)
return ioctl(file, cmd, arg);
return -ENOTTY;
@@ -400,9 +392,7 @@ static long proc_reg_unlocked_ioctl(struct file *file, unsigned int cmd, unsigne
#ifdef CONFIG_COMPAT
static long pde_compat_ioctl(struct proc_dir_entry *pde, struct file *file, unsigned int cmd, unsigned long arg)
{
- typeof_member(struct proc_ops, proc_compat_ioctl) compat_ioctl;
-
- compat_ioctl = pde->proc_ops->proc_compat_ioctl;
+ __auto_type compat_ioctl = pde->proc_ops->proc_compat_ioctl;
if (compat_ioctl)
return compat_ioctl(file, cmd, arg);
return -ENOTTY;
@@ -424,9 +414,7 @@ static long proc_reg_compat_ioctl(struct file *file, unsigned int cmd, unsigned
static int pde_mmap(struct proc_dir_entry *pde, struct file *file, struct vm_area_struct *vma)
{
- typeof_member(struct proc_ops, proc_mmap) mmap;
-
- mmap = pde->proc_ops->proc_mmap;
+ __auto_type mmap = pde->proc_ops->proc_mmap;
if (mmap)
return mmap(file, vma);
return -EIO;
@@ -483,7 +471,6 @@ static int proc_reg_open(struct inode *inode, struct file *file)
struct proc_dir_entry *pde = PDE(inode);
int rv = 0;
typeof_member(struct proc_ops, proc_open) open;
- typeof_member(struct proc_ops, proc_release) release;
struct pde_opener *pdeo;
if (!pde->proc_ops->proc_lseek)
@@ -510,7 +497,7 @@ static int proc_reg_open(struct inode *inode, struct file *file)
if (!use_pde(pde))
return -ENOENT;
- release = pde->proc_ops->proc_release;
+ __auto_type release = pde->proc_ops->proc_release;
if (release) {
pdeo = kmem_cache_alloc(pde_opener_cache, GFP_KERNEL);
if (!pdeo) {
@@ -547,9 +534,7 @@ static int proc_reg_release(struct inode *inode, struct file *file)
struct pde_opener *pdeo;
if (pde_is_permanent(pde)) {
- typeof_member(struct proc_ops, proc_release) release;
-
- release = pde->proc_ops->proc_release;
+ __auto_type release = pde->proc_ops->proc_release;
if (release) {
return release(inode, file);
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index a71ac5379584..87e4d6282025 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -13,6 +13,7 @@
#include <linux/binfmts.h>
#include <linux/sched/coredump.h>
#include <linux/sched/task.h>
+#include <linux/mm.h>
struct ctl_table_header;
struct mempolicy;
@@ -142,6 +143,37 @@ unsigned name_to_int(const struct qstr *qstr);
/* Worst case buffer size needed for holding an integer. */
#define PROC_NUMBUF 13
+/**
+ * folio_precise_page_mapcount() - Number of mappings of this folio page.
+ * @folio: The folio.
+ * @page: The page.
+ *
+ * The number of present user page table entries that reference this page
+ * as tracked via the RMAP: either referenced directly (PTE) or as part of
+ * a larger area that covers this page (e.g., PMD).
+ *
+ * Use this function only for the calculation of existing statistics
+ * (USS, PSS, mapcount_max) and for debugging purposes (/proc/kpagecount).
+ *
+ * Do not add new users.
+ *
+ * Returns: The number of mappings of this folio page. 0 for
+ * folios that are not mapped to user space or are not tracked via the RMAP
+ * (e.g., shared zeropage).
+ */
+static inline int folio_precise_page_mapcount(struct folio *folio,
+ struct page *page)
+{
+ int mapcount = atomic_read(&page->_mapcount) + 1;
+
+ if (page_mapcount_is_type(mapcount))
+ mapcount = 0;
+ if (folio_test_large(folio))
+ mapcount += folio_entire_mapcount(folio);
+
+ return mapcount;
+}
+
/*
* array.c
*/
@@ -316,3 +348,16 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde)
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
pde->proc_dops = &proc_net_dentry_ops;
}
+
+/*
+ * Add a new procfs dentry that can't serve as a mountpoint. That should
+ * encompass anything that is ephemeral and can just disappear while the
+ * process is still around.
+ */
+static inline struct dentry *proc_splice_unmountable(struct inode *inode,
+ struct dentry *dentry, const struct dentry_operations *d_ops)
+{
+ d_set_d_op(dentry, d_ops);
+ dont_mount(dentry);
+ return d_splice_alias(inode, dentry);
+}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 8e08a9a1b7ed..7d0acdad74e2 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -235,7 +235,7 @@ static int kcore_ram_list(struct list_head *list)
int nid, ret;
unsigned long end_pfn;
- /* Not inialized....update now */
+ /* Not initialized....update now */
/* find out "max pfn" */
end_pfn = 0;
for_each_node_state(nid, N_MEMORY) {
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 2fb64bdb64eb..a55f5acefa97 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -37,21 +37,19 @@ static inline unsigned long get_max_dump_pfn(void)
#endif
}
-/* /proc/kpagecount - an array exposing page counts
+/* /proc/kpagecount - an array exposing page mapcounts
*
* Each entry is a u64 representing the corresponding
- * physical page count.
+ * physical page mapcount.
*/
static ssize_t kpagecount_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
const unsigned long max_dump_pfn = get_max_dump_pfn();
u64 __user *out = (u64 __user *)buf;
- struct page *ppage;
unsigned long src = *ppos;
unsigned long pfn;
ssize_t ret = 0;
- u64 pcount;
pfn = src / KPMSIZE;
if (src & KPMMASK || count & KPMMASK)
@@ -61,18 +59,19 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src);
while (count > 0) {
+ struct page *page;
+ u64 mapcount = 0;
+
/*
* TODO: ZONE_DEVICE support requires to identify
* memmaps that were actually initialized.
*/
- ppage = pfn_to_online_page(pfn);
+ page = pfn_to_online_page(pfn);
+ if (page)
+ mapcount = folio_precise_page_mapcount(page_folio(page),
+ page);
- if (!ppage)
- pcount = 0;
- else
- pcount = page_mapcount(ppage);
-
- if (put_user(pcount, out)) {
+ if (put_user(mapcount, out)) {
ret = -EFAULT;
break;
}
@@ -148,19 +147,16 @@ u64 stable_page_flags(const struct page *page)
u |= 1 << KPF_COMPOUND_TAIL;
if (folio_test_hugetlb(folio))
u |= 1 << KPF_HUGE;
- /*
- * We need to check PageLRU/PageAnon
- * to make sure a given page is a thp, not a non-huge compound page.
- */
- else if (folio_test_large(folio)) {
- if ((k & (1 << PG_lru)) || is_anon)
- u |= 1 << KPF_THP;
- else if (is_huge_zero_folio(folio)) {
- u |= 1 << KPF_ZERO_PAGE;
- u |= 1 << KPF_THP;
- }
- } else if (is_zero_pfn(page_to_pfn(page)))
+ else if (folio_test_large(folio) &&
+ folio_test_large_rmappable(folio)) {
+ /* Note: we indicate any THPs here, not just PMD-sized ones */
+ u |= 1 << KPF_THP;
+ } else if (is_huge_zero_folio(folio)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ } else if (is_zero_folio(folio)) {
u |= 1 << KPF_ZERO_PAGE;
+ }
/*
* Caveats on high order pages: PG_buddy and PG_slab will only be set
@@ -186,7 +182,6 @@ u64 stable_page_flags(const struct page *page)
#endif
u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
- u |= kpf_copy_bit(k, KPF_ERROR, PG_error);
u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty);
u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate);
u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback);
@@ -211,18 +206,16 @@ u64 stable_page_flags(const struct page *page)
u |= kpf_copy_bit(page->flags, KPF_HWPOISON, PG_hwpoison);
#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached);
-#endif
-
u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved);
- u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk);
+ u |= kpf_copy_bit(k, KPF_OWNER_2, PG_owner_2);
u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private);
u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2);
u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1);
u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1);
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2);
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
u |= kpf_copy_bit(k, KPF_ARCH_3, PG_arch_3);
#endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b1c2c0b82116..d11ebc055ce0 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -21,7 +21,7 @@
#define list_for_each_table_entry(entry, header) \
entry = header->ctl_table; \
- for (size_t i = 0 ; i < header->ctl_table_size && entry->procname; ++i, entry++)
+ for (size_t i = 0 ; i < header->ctl_table_size; ++i, entry++)
static const struct dentry_operations proc_sys_dentry_operations;
static const struct file_operations proc_sys_file_operations;
@@ -29,8 +29,13 @@ static const struct inode_operations proc_sys_inode_operations;
static const struct file_operations proc_sys_dir_file_operations;
static const struct inode_operations proc_sys_dir_operations;
-/* Support for permanently empty directories */
-static struct ctl_table sysctl_mount_point[] = { };
+/*
+ * Support for permanently empty directories.
+ * Must be non-empty to avoid sharing an address with other tables.
+ */
+static struct ctl_table sysctl_mount_point[] = {
+ { }
+};
/**
* register_sysctl_mount_point() - registers a sysctl mount point
@@ -42,7 +47,7 @@ static struct ctl_table sysctl_mount_point[] = { };
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl(path, sysctl_mount_point);
+ return register_sysctl_sz(path, sysctl_mount_point, 0);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
@@ -476,12 +481,10 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
make_empty_dir_inode(inode);
}
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
if (root->set_ownership)
root->set_ownership(head, &inode->i_uid, &inode->i_gid);
- else {
- inode->i_uid = GLOBAL_ROOT_UID;
- inode->i_gid = GLOBAL_ROOT_GID;
- }
return inode;
}
@@ -951,14 +954,14 @@ static struct ctl_dir *new_dir(struct ctl_table_set *set,
char *new_name;
new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
- sizeof(struct ctl_table)*2 + namelen + 1,
+ sizeof(struct ctl_table) + namelen + 1,
GFP_KERNEL);
if (!new)
return NULL;
node = (struct ctl_node *)(new + 1);
table = (struct ctl_table *)(node + 1);
- new_name = (char *)(table + 2);
+ new_name = (char *)(table + 1);
memcpy(new_name, name, namelen);
table[0].procname = new_name;
table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
@@ -1093,6 +1096,7 @@ static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
static int sysctl_check_table_array(const char *path, struct ctl_table *table)
{
+ unsigned int extra;
int err = 0;
if ((table->proc_handler == proc_douintvec) ||
@@ -1104,6 +1108,19 @@ static int sysctl_check_table_array(const char *path, struct ctl_table *table)
if (table->proc_handler == proc_dou8vec_minmax) {
if (table->maxlen != sizeof(u8))
err |= sysctl_err(path, table, "array not allowed");
+
+ if (table->extra1) {
+ extra = *(unsigned int *) table->extra1;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
+ if (table->extra2) {
+ extra = *(unsigned int *) table->extra2;
+ if (extra > 255U)
+ err |= sysctl_err(path, table,
+ "range value too large for proc_dou8vec_minmax");
+ }
}
if (table->proc_handler == proc_dobool) {
@@ -1119,6 +1136,8 @@ static int sysctl_check_table(const char *path, struct ctl_table_header *header)
struct ctl_table *entry;
int err = 0;
list_for_each_table_entry(entry, header) {
+ if (!entry->procname)
+ err |= sysctl_err(path, entry, "procname is null");
if ((entry->proc_handler == proc_dostring) ||
(entry->proc_handler == proc_dobool) ||
(entry->proc_handler == proc_dointvec) ||
@@ -1154,18 +1173,16 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
struct ctl_table_header *links;
struct ctl_node *node;
char *link_name;
- int nr_entries, name_bytes;
+ int name_bytes;
name_bytes = 0;
- nr_entries = 0;
list_for_each_table_entry(entry, head) {
- nr_entries++;
name_bytes += strlen(entry->procname) + 1;
}
links = kzalloc(sizeof(struct ctl_table_header) +
- sizeof(struct ctl_node)*nr_entries +
- sizeof(struct ctl_table)*(nr_entries + 1) +
+ sizeof(struct ctl_node)*head->ctl_table_size +
+ sizeof(struct ctl_table)*head->ctl_table_size +
name_bytes,
GFP_KERNEL);
@@ -1173,8 +1190,8 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
return NULL;
node = (struct ctl_node *)(links + 1);
- link_table = (struct ctl_table *)(node + nr_entries);
- link_name = (char *)&link_table[nr_entries + 1];
+ link_table = (struct ctl_table *)(node + head->ctl_table_size);
+ link_name = (char *)(link_table + head->ctl_table_size);
link = link_table;
list_for_each_table_entry(entry, head) {
@@ -1188,7 +1205,7 @@ static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table_
}
init_header(links, dir->header.root, dir->header.set, node, link_table,
head->ctl_table_size);
- links->nreg = nr_entries;
+ links->nreg = head->ctl_table_size;
return links;
}
@@ -1300,28 +1317,23 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* __register_sysctl_table - register a leaf sysctl table
* @set: Sysctl tree to register on
* @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure without any child. This table
- * should not be free'd after registration. So it should not be
- * used on stack. It can either be a global or dynamically allocated
- * by the caller and free'd later after sysctl unregistration.
+ *
+ * @table: the top-level table structure. This table should not be free'd
+ * after registration. So it should not be used on stack. It can either
+ * be a global or dynamically allocated by the caller and free'd later
+ * after sysctl unregistration.
* @table_size : The number of elements in table
*
* Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
+ * array.
*
* The members of the &struct ctl_table structure are used as follows:
- *
* procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
* enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file
- *
- * child - must be %NULL.
- *
+ * data - a pointer to data for use by proc_handler
+ * maxlen - the maximum size in bytes of the data
+ * mode - the file permissions for the /proc/sys file
+ * type - Defines the target type (described in struct definition)
* proc_handler - the text handler routine (described below)
*
* extra1, extra2 - extra pointers usable by the proc handler routines
@@ -1329,8 +1341,7 @@ static struct ctl_dir *sysctl_mkdir_p(struct ctl_dir *dir, const char *path)
* [0] https://lkml.kernel.org/87zgpte9o4.fsf@email.froward.int.ebiederm.org
*
* Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes (where child is not NULL) are not allowed,
- * sysctl_check_table() verifies this.
+ * under /proc; non-leaf nodes are not allowed.
*
* There must be a proc_handler routine for any terminal nodes.
* Several default handlers are available to cover common cases -
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f8d35f993fe5..72f14fd59c2d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -22,6 +22,7 @@
#include <linux/pkeys.h>
#include <linux/minmax.h>
#include <linux/overflow.h>
+#include <linux/buildid.h>
#include <asm/elf.h>
#include <asm/tlb.h>
@@ -239,6 +240,67 @@ static int do_maps_open(struct inode *inode, struct file *file,
sizeof(struct proc_maps_private));
}
+static void get_vma_name(struct vm_area_struct *vma,
+ const struct path **path,
+ const char **name,
+ const char **name_fmt)
+{
+ struct anon_vma_name *anon_name = vma->vm_mm ? anon_vma_name(vma) : NULL;
+
+ *name = NULL;
+ *path = NULL;
+ *name_fmt = NULL;
+
+ /*
+ * Print the dentry name for named mappings, and a
+ * special [heap] marker for the heap:
+ */
+ if (vma->vm_file) {
+ /*
+ * If user named this anon shared memory via
+ * prctl(PR_SET_VMA ..., use the provided name.
+ */
+ if (anon_name) {
+ *name_fmt = "[anon_shmem:%s]";
+ *name = anon_name->name;
+ } else {
+ *path = file_user_path(vma->vm_file);
+ }
+ return;
+ }
+
+ if (vma->vm_ops && vma->vm_ops->name) {
+ *name = vma->vm_ops->name(vma);
+ if (*name)
+ return;
+ }
+
+ *name = arch_vma_name(vma);
+ if (*name)
+ return;
+
+ if (!vma->vm_mm) {
+ *name = "[vdso]";
+ return;
+ }
+
+ if (vma_is_initial_heap(vma)) {
+ *name = "[heap]";
+ return;
+ }
+
+ if (vma_is_initial_stack(vma)) {
+ *name = "[stack]";
+ return;
+ }
+
+ if (anon_name) {
+ *name_fmt = "[anon:%s]";
+ *name = anon_name->name;
+ return;
+ }
+}
+
static void show_vma_header_prefix(struct seq_file *m,
unsigned long start, unsigned long end,
vm_flags_t flags, unsigned long long pgoff,
@@ -262,17 +324,15 @@ static void show_vma_header_prefix(struct seq_file *m,
static void
show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
{
- struct anon_vma_name *anon_name = NULL;
- struct mm_struct *mm = vma->vm_mm;
- struct file *file = vma->vm_file;
+ const struct path *path;
+ const char *name_fmt, *name;
vm_flags_t flags = vma->vm_flags;
unsigned long ino = 0;
unsigned long long pgoff = 0;
unsigned long start, end;
dev_t dev = 0;
- const char *name = NULL;
- if (file) {
+ if (vma->vm_file) {
const struct inode *inode = file_user_inode(vma->vm_file);
dev = inode->i_sb->s_dev;
@@ -283,57 +343,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
start = vma->vm_start;
end = vma->vm_end;
show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino);
- if (mm)
- anon_name = anon_vma_name(vma);
- /*
- * Print the dentry name for named mappings, and a
- * special [heap] marker for the heap:
- */
- if (file) {
+ get_vma_name(vma, &path, &name, &name_fmt);
+ if (path) {
seq_pad(m, ' ');
- /*
- * If user named this anon shared memory via
- * prctl(PR_SET_VMA ..., use the provided name.
- */
- if (anon_name)
- seq_printf(m, "[anon_shmem:%s]", anon_name->name);
- else
- seq_path(m, file_user_path(file), "\n");
- goto done;
- }
-
- if (vma->vm_ops && vma->vm_ops->name) {
- name = vma->vm_ops->name(vma);
- if (name)
- goto done;
- }
-
- name = arch_vma_name(vma);
- if (!name) {
- if (!mm) {
- name = "[vdso]";
- goto done;
- }
-
- if (vma_is_initial_heap(vma)) {
- name = "[heap]";
- goto done;
- }
-
- if (vma_is_initial_stack(vma)) {
- name = "[stack]";
- goto done;
- }
-
- if (anon_name) {
- seq_pad(m, ' ');
- seq_printf(m, "[anon:%s]", anon_name->name);
- }
- }
-
-done:
- if (name) {
+ seq_path(m, path, "\n");
+ } else if (name_fmt) {
+ seq_pad(m, ' ');
+ seq_printf(m, name_fmt, name);
+ } else if (name) {
seq_pad(m, ' ');
seq_puts(m, name);
}
@@ -358,11 +376,253 @@ static int pid_maps_open(struct inode *inode, struct file *file)
return do_maps_open(inode, file, &proc_pid_maps_op);
}
+#define PROCMAP_QUERY_VMA_FLAGS ( \
+ PROCMAP_QUERY_VMA_READABLE | \
+ PROCMAP_QUERY_VMA_WRITABLE | \
+ PROCMAP_QUERY_VMA_EXECUTABLE | \
+ PROCMAP_QUERY_VMA_SHARED \
+)
+
+#define PROCMAP_QUERY_VALID_FLAGS_MASK ( \
+ PROCMAP_QUERY_COVERING_OR_NEXT_VMA | \
+ PROCMAP_QUERY_FILE_BACKED_VMA | \
+ PROCMAP_QUERY_VMA_FLAGS \
+)
+
+static int query_vma_setup(struct mm_struct *mm)
+{
+ return mmap_read_lock_killable(mm);
+}
+
+static void query_vma_teardown(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ mmap_read_unlock(mm);
+}
+
+static struct vm_area_struct *query_vma_find_by_addr(struct mm_struct *mm, unsigned long addr)
+{
+ return find_vma(mm, addr);
+}
+
+static struct vm_area_struct *query_matching_vma(struct mm_struct *mm,
+ unsigned long addr, u32 flags)
+{
+ struct vm_area_struct *vma;
+
+next_vma:
+ vma = query_vma_find_by_addr(mm, addr);
+ if (!vma)
+ goto no_vma;
+
+ /* user requested only file-backed VMA, keep iterating */
+ if ((flags & PROCMAP_QUERY_FILE_BACKED_VMA) && !vma->vm_file)
+ goto skip_vma;
+
+ /* VMA permissions should satisfy query flags */
+ if (flags & PROCMAP_QUERY_VMA_FLAGS) {
+ u32 perm = 0;
+
+ if (flags & PROCMAP_QUERY_VMA_READABLE)
+ perm |= VM_READ;
+ if (flags & PROCMAP_QUERY_VMA_WRITABLE)
+ perm |= VM_WRITE;
+ if (flags & PROCMAP_QUERY_VMA_EXECUTABLE)
+ perm |= VM_EXEC;
+ if (flags & PROCMAP_QUERY_VMA_SHARED)
+ perm |= VM_MAYSHARE;
+
+ if ((vma->vm_flags & perm) != perm)
+ goto skip_vma;
+ }
+
+ /* found covering VMA or user is OK with the matching next VMA */
+ if ((flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA) || vma->vm_start <= addr)
+ return vma;
+
+skip_vma:
+ /*
+ * If the user needs closest matching VMA, keep iterating.
+ */
+ addr = vma->vm_end;
+ if (flags & PROCMAP_QUERY_COVERING_OR_NEXT_VMA)
+ goto next_vma;
+
+no_vma:
+ return ERR_PTR(-ENOENT);
+}
+
+static int do_procmap_query(struct proc_maps_private *priv, void __user *uarg)
+{
+ struct procmap_query karg;
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+ const char *name = NULL;
+ char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
+ __u64 usize;
+ int err;
+
+ if (copy_from_user(&usize, (void __user *)uarg, sizeof(usize)))
+ return -EFAULT;
+ /* argument struct can never be that large, reject abuse */
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+ /* argument struct should have at least query_flags and query_addr fields */
+ if (usize < offsetofend(struct procmap_query, query_addr))
+ return -EINVAL;
+ err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
+ if (err)
+ return err;
+
+ /* reject unknown flags */
+ if (karg.query_flags & ~PROCMAP_QUERY_VALID_FLAGS_MASK)
+ return -EINVAL;
+ /* either both buffer address and size are set, or both should be zero */
+ if (!!karg.vma_name_size != !!karg.vma_name_addr)
+ return -EINVAL;
+ if (!!karg.build_id_size != !!karg.build_id_addr)
+ return -EINVAL;
+
+ mm = priv->mm;
+ if (!mm || !mmget_not_zero(mm))
+ return -ESRCH;
+
+ err = query_vma_setup(mm);
+ if (err) {
+ mmput(mm);
+ return err;
+ }
+
+ vma = query_matching_vma(mm, karg.query_addr, karg.query_flags);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ vma = NULL;
+ goto out;
+ }
+
+ karg.vma_start = vma->vm_start;
+ karg.vma_end = vma->vm_end;
+
+ karg.vma_flags = 0;
+ if (vma->vm_flags & VM_READ)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_READABLE;
+ if (vma->vm_flags & VM_WRITE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_WRITABLE;
+ if (vma->vm_flags & VM_EXEC)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_EXECUTABLE;
+ if (vma->vm_flags & VM_MAYSHARE)
+ karg.vma_flags |= PROCMAP_QUERY_VMA_SHARED;
+
+ karg.vma_page_size = vma_kernel_pagesize(vma);
+
+ if (vma->vm_file) {
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
+ karg.vma_offset = ((__u64)vma->vm_pgoff) << PAGE_SHIFT;
+ karg.dev_major = MAJOR(inode->i_sb->s_dev);
+ karg.dev_minor = MINOR(inode->i_sb->s_dev);
+ karg.inode = inode->i_ino;
+ } else {
+ karg.vma_offset = 0;
+ karg.dev_major = 0;
+ karg.dev_minor = 0;
+ karg.inode = 0;
+ }
+
+ if (karg.build_id_size) {
+ __u32 build_id_sz;
+
+ err = build_id_parse(vma, build_id_buf, &build_id_sz);
+ if (err) {
+ karg.build_id_size = 0;
+ } else {
+ if (karg.build_id_size < build_id_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.build_id_size = build_id_sz;
+ }
+ }
+
+ if (karg.vma_name_size) {
+ size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
+ const struct path *path;
+ const char *name_fmt;
+ size_t name_sz = 0;
+
+ get_vma_name(vma, &path, &name, &name_fmt);
+
+ if (path || name_fmt || name) {
+ name_buf = kmalloc(name_buf_sz, GFP_KERNEL);
+ if (!name_buf) {
+ err = -ENOMEM;
+ goto out;
+ }
+ }
+ if (path) {
+ name = d_path(path, name_buf, name_buf_sz);
+ if (IS_ERR(name)) {
+ err = PTR_ERR(name);
+ goto out;
+ }
+ name_sz = name_buf + name_buf_sz - name;
+ } else if (name || name_fmt) {
+ name_sz = 1 + snprintf(name_buf, name_buf_sz, name_fmt ?: "%s", name);
+ name = name_buf;
+ }
+ if (name_sz > name_buf_sz) {
+ err = -ENAMETOOLONG;
+ goto out;
+ }
+ karg.vma_name_size = name_sz;
+ }
+
+ /* unlock vma or mmap_lock, and put mm_struct before copying data to user */
+ query_vma_teardown(mm, vma);
+ mmput(mm);
+
+ if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
+ name, karg.vma_name_size)) {
+ kfree(name_buf);
+ return -EFAULT;
+ }
+ kfree(name_buf);
+
+ if (karg.build_id_size && copy_to_user(u64_to_user_ptr(karg.build_id_addr),
+ build_id_buf, karg.build_id_size))
+ return -EFAULT;
+
+ if (copy_to_user(uarg, &karg, min_t(size_t, sizeof(karg), usize)))
+ return -EFAULT;
+
+ return 0;
+
+out:
+ query_vma_teardown(mm, vma);
+ mmput(mm);
+ kfree(name_buf);
+ return err;
+}
+
+static long procfs_procmap_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct seq_file *seq = file->private_data;
+ struct proc_maps_private *priv = seq->private;
+
+ switch (cmd) {
+ case PROCMAP_QUERY:
+ return do_procmap_query(priv, (void __user *)arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
const struct file_operations proc_pid_maps_operations = {
.open = pid_maps_open,
.read = seq_read,
.llseek = seq_lseek,
.release = proc_map_release,
+ .unlocked_ioctl = procfs_procmap_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
};
/*
@@ -442,7 +702,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss,
static void smaps_account(struct mem_size_stats *mss, struct page *page,
bool compound, bool young, bool dirty, bool locked,
- bool migration)
+ bool present)
{
struct folio *folio = page_folio(page);
int i, nr = compound ? compound_nr(page) : 1;
@@ -471,24 +731,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
* Then accumulate quantities that may depend on sharing, or that may
* differ page-by-page.
*
- * refcount == 1 guarantees the page is mapped exactly once.
- * If any subpage of the compound page mapped with PTE it would elevate
- * the refcount.
+ * refcount == 1 for present entries guarantees that the folio is mapped
+ * exactly once. For large folios this implies that exactly one
+ * PTE/PMD/... maps (a part of) this folio.
+ *
+ * Treat all non-present entries (where relying on the mapcount and
+ * refcount doesn't make sense) as "maybe shared, but not sure how
+ * often". We treat device private entries as being fake-present.
*
- * The page_mapcount() is called to get a snapshot of the mapcount.
- * Without holding the page lock this snapshot can be slightly wrong as
- * we cannot always read the mapcount atomically. It is not safe to
- * call page_mapcount() even with PTL held if the page is not mapped,
- * especially for migration entries. Treat regular migration entries
- * as mapcount == 1.
+ * Note that it would not be safe to read the mapcount especially for
+ * pages referenced by migration entries, even with the PTL held.
*/
- if ((folio_ref_count(folio) == 1) || migration) {
+ if (folio_ref_count(folio) == 1 || !present) {
smaps_page_accumulate(mss, folio, size, size << PSS_SHIFT,
- dirty, locked, true);
+ dirty, locked, present);
return;
}
+ /*
+ * We obtain a snapshot of the mapcount. Without holding the folio lock
+ * this snapshot can be slightly wrong as we cannot always read the
+ * mapcount atomically.
+ */
for (i = 0; i < nr; i++, page++) {
- int mapcount = page_mapcount(page);
+ int mapcount = folio_precise_page_mapcount(folio, page);
unsigned long pss = PAGE_SIZE << PSS_SHIFT;
if (mapcount >= 2)
pss /= mapcount;
@@ -531,13 +796,14 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false, young = false, dirty = false;
+ bool present = false, young = false, dirty = false;
pte_t ptent = ptep_get(pte);
if (pte_present(ptent)) {
page = vm_normal_page(vma, addr, ptent);
young = pte_young(ptent);
dirty = pte_dirty(ptent);
+ present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
@@ -555,8 +821,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
}
} else if (is_pfn_swap_entry(swpent)) {
- if (is_migration_entry(swpent))
- migration = true;
+ if (is_device_private_entry(swpent))
+ present = true;
page = pfn_swap_entry_to_page(swpent);
}
} else {
@@ -567,7 +833,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
if (!page)
return;
- smaps_account(mss, page, false, young, dirty, locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, present);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -578,18 +844,17 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
+ bool present = false;
struct folio *folio;
- bool migration = false;
if (pmd_present(*pmd)) {
page = vm_normal_page_pmd(vma, addr, *pmd);
+ present = true;
} else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) {
swp_entry_t entry = pmd_to_swp_entry(*pmd);
- if (is_migration_entry(entry)) {
- migration = true;
+ if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
- }
}
if (IS_ERR_OR_NULL(page))
return;
@@ -604,7 +869,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
mss->file_thp += HPAGE_PMD_SIZE;
smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd),
- locked, migration);
+ locked, present);
}
#else
static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -696,7 +961,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT0)] = "",
[ilog2(VM_PKEY_BIT1)] = "",
[ilog2(VM_PKEY_BIT2)] = "",
+#if VM_PKEY_BIT3
[ilog2(VM_PKEY_BIT3)] = "",
+#endif
#if VM_PKEY_BIT4
[ilog2(VM_PKEY_BIT4)] = "",
#endif
@@ -707,6 +974,12 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_X86_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
+#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
+ [ilog2(VM_DROPPABLE)] = "dp",
+#endif
+#ifdef CONFIG_64BIT
+ [ilog2(VM_SEALED)] = "sl",
+#endif
};
size_t i;
@@ -730,19 +1003,23 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
{
struct mem_size_stats *mss = walk->private;
struct vm_area_struct *vma = walk->vma;
- pte_t ptent = huge_ptep_get(pte);
+ pte_t ptent = huge_ptep_get(walk->mm, addr, pte);
struct folio *folio = NULL;
+ bool present = false;
if (pte_present(ptent)) {
folio = page_folio(pte_page(ptent));
+ present = true;
} else if (is_swap_pte(ptent)) {
swp_entry_t swpent = pte_to_swp_entry(ptent);
if (is_pfn_swap_entry(swpent))
folio = pfn_swap_entry_folio(swpent);
}
+
if (folio) {
- if (folio_likely_mapped_shared(folio) ||
+ /* We treat non-present entries as "maybe shared". */
+ if (!present || folio_likely_mapped_shared(folio) ||
hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
@@ -1088,7 +1365,7 @@ struct clear_refs_private {
static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
{
- struct page *page;
+ struct folio *folio;
if (!pte_write(pte))
return false;
@@ -1096,10 +1373,10 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
return false;
if (likely(!test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags)))
return false;
- page = vm_normal_page(vma, addr, pte);
- if (!page)
+ folio = vm_normal_folio(vma, addr, pte);
+ if (!folio)
return false;
- return page_maybe_dma_pinned(page);
+ return folio_maybe_dma_pinned(folio);
}
static inline void clear_soft_dirty(struct vm_area_struct *vma,
@@ -1415,7 +1692,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
{
u64 frame = 0, flags = 0;
struct page *page = NULL;
- bool migration = false;
+ struct folio *folio;
if (pte_present(pte)) {
if (pm->show_pfn)
@@ -1447,17 +1724,20 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
(offset << MAX_SWAPFILES_SHIFT);
}
flags |= PM_SWAP;
- migration = is_migration_entry(entry);
if (is_pfn_swap_entry(entry))
page = pfn_swap_entry_to_page(entry);
if (pte_marker_entry_uffd_wp(entry))
flags |= PM_UFFD_WP;
}
- if (page && !PageAnon(page))
- flags |= PM_FILE;
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ if ((flags & PM_PRESENT) &&
+ folio_precise_page_mapcount(folio, page) == 1)
+ flags |= PM_MMAP_EXCLUSIVE;
+ }
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1473,13 +1753,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pte_t *pte, *orig_pte;
int err = 0;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool migration = false;
ptl = pmd_trans_huge_lock(pmdp, vma);
if (ptl) {
+ unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT;
u64 flags = 0, frame = 0;
pmd_t pmd = *pmdp;
struct page *page = NULL;
+ struct folio *folio = NULL;
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1493,8 +1774,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_uffd_wp(pmd))
flags |= PM_UFFD_WP;
if (pm->show_pfn)
- frame = pmd_pfn(pmd) +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ frame = pmd_pfn(pmd) + idx;
}
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
else if (is_swap_pmd(pmd)) {
@@ -1503,11 +1783,9 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pm->show_pfn) {
if (is_pfn_swap_entry(entry))
- offset = swp_offset_pfn(entry);
+ offset = swp_offset_pfn(entry) + idx;
else
- offset = swp_offset(entry);
- offset = offset +
- ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ offset = swp_offset(entry) + idx;
frame = swp_type(entry) |
(offset << MAX_SWAPFILES_SHIFT);
}
@@ -1517,17 +1795,25 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
- migration = is_migration_entry(entry);
page = pfn_swap_entry_to_page(entry);
}
#endif
- if (page && !migration && page_mapcount(page) == 1)
- flags |= PM_MMAP_EXCLUSIVE;
+ if (page) {
+ folio = page_folio(page);
+ if (!folio_test_anon(folio))
+ flags |= PM_FILE;
+ }
+
+ for (; addr != end; addr += PAGE_SIZE, idx++) {
+ unsigned long cur_flags = flags;
+ pagemap_entry_t pme;
- for (; addr != end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(frame, flags);
+ if (folio && (flags & PM_PRESENT) &&
+ folio_precise_page_mapcount(folio, page + idx) == 1)
+ cur_flags |= PM_MMAP_EXCLUSIVE;
+ pme = make_pme(frame, cur_flags);
err = add_to_pagemap(&pme, pm);
if (err)
break;
@@ -1582,7 +1868,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, addr, ptep);
if (pte_present(pte)) {
struct folio *folio = page_folio(pte_page(pte));
@@ -2271,7 +2557,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
if (~p->arg.flags & PM_SCAN_WP_MATCHING) {
/* Go the short route when not write-protecting pages. */
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, start, ptep);
categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
if (!pagemap_scan_is_interesting_page(categories, p))
@@ -2283,7 +2569,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
i_mmap_lock_write(vma->vm_file->f_mapping);
ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
- pte = huge_ptep_get(ptep);
+ pte = huge_ptep_get(walk->mm, start, ptep);
categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
if (!pagemap_scan_is_interesting_page(categories, p))
@@ -2563,7 +2849,7 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
unsigned long nr_pages)
{
struct folio *folio = page_folio(page);
- int count = page_mapcount(page);
+ int count = folio_precise_page_mapcount(folio, page);
md->pages += nr_pages;
if (pte_dirty || folio_test_dirty(folio))
@@ -2679,7 +2965,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
- pte_t huge_pte = huge_ptep_get(pte);
+ pte_t huge_pte = huge_ptep_get(walk->mm, addr, pte);
struct numa_maps *md;
struct page *page;