From c164e038eee805147e95789dddb88ae3b3aca11c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Dec 2014 15:44:36 -0800 Subject: mm: fix huge zero page accounting in smaps report As a small zero page, huge zero page should not be accounted in smaps report as normal page. For small pages we rely on vm_normal_page() to filter out zero page, but vm_normal_page() is not designed to handle pmds. We only get here due hackish cast pmd to pte in smaps_pte_range() -- pte and pmd format is not necessary compatible on each and every architecture. Let's add separate codepath to handle pmds. follow_trans_huge_pmd() will detect huge zero page for us. We would need pmd_dirty() helper to do this properly. The patch adds it to THP-enabled architectures which don't yet have one. [akpm@linux-foundation.org: use do_div to fix 32-bit build] Signed-off-by: "Kirill A. Shutemov" Reported-by: Fengguang Wu Tested-by: Fengwei Yin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 104 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 36 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f6734c6b66a6..246eae84b13b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -447,58 +447,91 @@ struct mem_size_stats { u64 pss; }; +static void smaps_account(struct mem_size_stats *mss, struct page *page, + unsigned long size, bool young, bool dirty) +{ + int mapcount; + + if (PageAnon(page)) + mss->anonymous += size; -static void smaps_pte_entry(pte_t ptent, unsigned long addr, - unsigned long ptent_size, struct mm_walk *walk) + mss->resident += size; + /* Accumulate the size in pages that have been accessed. */ + if (young || PageReferenced(page)) + mss->referenced += size; + mapcount = page_mapcount(page); + if (mapcount >= 2) { + u64 pss_delta; + + if (dirty || PageDirty(page)) + mss->shared_dirty += size; + else + mss->shared_clean += size; + pss_delta = (u64)size << PSS_SHIFT; + do_div(pss_delta, mapcount); + mss->pss += pss_delta; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += size; + else + mss->private_clean += size; + mss->pss += (u64)size << PSS_SHIFT; + } +} + +static void smaps_pte_entry(pte_t *pte, unsigned long addr, + struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; struct vm_area_struct *vma = mss->vma; pgoff_t pgoff = linear_page_index(vma, addr); struct page *page = NULL; - int mapcount; - if (pte_present(ptent)) { - page = vm_normal_page(vma, addr, ptent); - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + if (pte_present(*pte)) { + page = vm_normal_page(vma, addr, *pte); + } else if (is_swap_pte(*pte)) { + swp_entry_t swpent = pte_to_swp_entry(*pte); if (!non_swap_entry(swpent)) - mss->swap += ptent_size; + mss->swap += PAGE_SIZE; else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); - } else if (pte_file(ptent)) { - if (pte_to_pgoff(ptent) != pgoff) - mss->nonlinear += ptent_size; + } else if (pte_file(*pte)) { + if (pte_to_pgoff(*pte) != pgoff) + mss->nonlinear += PAGE_SIZE; } if (!page) return; - if (PageAnon(page)) - mss->anonymous += ptent_size; - if (page->index != pgoff) - mss->nonlinear += ptent_size; + mss->nonlinear += PAGE_SIZE; - mss->resident += ptent_size; - /* Accumulate the size in pages that have been accessed. */ - if (pte_young(ptent) || PageReferenced(page)) - mss->referenced += ptent_size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - if (pte_dirty(ptent) || PageDirty(page)) - mss->shared_dirty += ptent_size; - else - mss->shared_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT) / mapcount; - } else { - if (pte_dirty(ptent) || PageDirty(page)) - mss->private_dirty += ptent_size; - else - mss->private_clean += ptent_size; - mss->pss += (ptent_size << PSS_SHIFT); - } + smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = mss->vma; + struct page *page; + + /* FOLL_DUMP will return -EFAULT on huge zero page */ + page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return; + mss->anonymous_thp += HPAGE_PMD_SIZE; + smaps_account(mss, page, HPAGE_PMD_SIZE, + pmd_young(*pmd), pmd_dirty(*pmd)); } +#else +static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, + struct mm_walk *walk) +{ +} +#endif static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -509,9 +542,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, spinlock_t *ptl; if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { - smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -524,7 +556,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) - smaps_pte_entry(*pte, addr, PAGE_SIZE, walk); + smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); cond_resched(); return 0; -- cgit v1.2.3 From 710585d4922fd315f2cada8fbe550ae8ed23e994 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 10 Dec 2014 15:45:01 -0800 Subject: fs/proc: use a rb tree for the directory entries When a lot of netdevices are created, one of the bottleneck is the creation of proc entries. This serie aims to accelerate this part. The current implementation for the directories in /proc is using a single linked list. This is slow when handling directories with large numbers of entries (eg netdevice-related entries when lots of tunnels are opened). This patch replaces this linked list by a red-black tree. Here are some numbers: dummy30000.batch contains 30 000 times 'link add type dummy'. Before the patch: $ time ip -b dummy30000.batch real 2m31.950s user 0m0.440s sys 2m21.440s $ time rmmod dummy real 1m35.764s user 0m0.000s sys 1m24.088s After the patch: $ time ip -b dummy30000.batch real 2m0.874s user 0m0.448s sys 1m49.720s $ time rmmod dummy real 1m13.988s user 0m0.000s sys 1m1.008s The idea of improving this part was suggested by Thierry Herbelot. [akpm@linux-foundation.org: initialise proc_root.subdir at compile time] Signed-off-by: Nicolas Dichtel Acked-by: David S. Miller Cc: Thierry Herbelot . Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 164 ++++++++++++++++++++++++++++++++++------------------- fs/proc/internal.h | 11 ++-- fs/proc/proc_net.c | 1 + fs/proc/root.c | 1 + 4 files changed, 113 insertions(+), 64 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 317b72641ebf..9f8fa1e5e8aa 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -31,9 +31,81 @@ static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { - if (de->namelen != len) - return 0; - return !memcmp(name, de->name, len); + if (len < de->namelen) + return -1; + if (len > de->namelen) + return 1; + + return memcmp(name, de->name, len); +} + +static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) +{ + struct rb_node *node = rb_first(&dir->subdir); + + if (node == NULL) + return NULL; + + return rb_entry(node, struct proc_dir_entry, subdir_node); +} + +static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) +{ + struct rb_node *node = rb_next(&dir->subdir_node); + + if (node == NULL) + return NULL; + + return rb_entry(node, struct proc_dir_entry, subdir_node); +} + +static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, + const char *name, + unsigned int len) +{ + struct rb_node *node = dir->subdir.rb_node; + + while (node) { + struct proc_dir_entry *de = container_of(node, + struct proc_dir_entry, + subdir_node); + int result = proc_match(len, name, de); + + if (result < 0) + node = node->rb_left; + else if (result > 0) + node = node->rb_right; + else + return de; + } + return NULL; +} + +static bool pde_subdir_insert(struct proc_dir_entry *dir, + struct proc_dir_entry *de) +{ + struct rb_root *root = &dir->subdir; + struct rb_node **new = &root->rb_node, *parent = NULL; + + /* Figure out where to put new node */ + while (*new) { + struct proc_dir_entry *this = + container_of(*new, struct proc_dir_entry, subdir_node); + int result = proc_match(de->namelen, de->name, this); + + parent = *new; + if (result < 0) + new = &(*new)->rb_left; + else if (result > 0) + new = &(*new)->rb_right; + else + return false; + } + + /* Add new node and rebalance tree. */ + rb_link_node(&de->subdir_node, parent, new); + rb_insert_color(&de->subdir_node, root); + return true; } static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) @@ -92,10 +164,7 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, break; len = next - cp; - for (de = de->subdir; de ; de = de->next) { - if (proc_match(len, cp, de)) - break; - } + de = pde_subdir_find(de, cp, len); if (!de) { WARN(1, "name '%s'\n", name); return -ENOENT; @@ -183,19 +252,16 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct inode *inode; spin_lock(&proc_subdir_lock); - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - pde_get(de); - spin_unlock(&proc_subdir_lock); - inode = proc_get_inode(dir->i_sb, de); - if (!inode) - return ERR_PTR(-ENOMEM); - d_set_d_op(dentry, &simple_dentry_operations); - d_add(dentry, inode); - return NULL; - } + de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); + if (de) { + pde_get(de); + spin_unlock(&proc_subdir_lock); + inode = proc_get_inode(dir->i_sb, de); + if (!inode) + return ERR_PTR(-ENOMEM); + d_set_d_op(dentry, &simple_dentry_operations); + d_add(dentry, inode); + return NULL; } spin_unlock(&proc_subdir_lock); return ERR_PTR(-ENOENT); @@ -225,7 +291,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, return 0; spin_lock(&proc_subdir_lock); - de = de->subdir; + de = pde_subdir_first(de); i = ctx->pos - 2; for (;;) { if (!de) { @@ -234,7 +300,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } if (!i) break; - de = de->next; + de = pde_subdir_next(de); i--; } @@ -249,7 +315,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, } spin_lock(&proc_subdir_lock); ctx->pos++; - next = de->next; + next = pde_subdir_next(de); pde_put(de); de = next; } while (de); @@ -286,9 +352,8 @@ static const struct inode_operations proc_dir_inode_operations = { static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) { - struct proc_dir_entry *tmp; int ret; - + ret = proc_alloc_inum(&dp->low_ino); if (ret) return ret; @@ -308,17 +373,10 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp } spin_lock(&proc_subdir_lock); - - for (tmp = dir->subdir; tmp; tmp = tmp->next) - if (strcmp(tmp->name, dp->name) == 0) { - WARN(1, "proc_dir_entry '%s/%s' already registered\n", - dir->name, dp->name); - break; - } - - dp->next = dir->subdir; dp->parent = dir; - dir->subdir = dp; + if (pde_subdir_insert(dir, dp) == false) + WARN(1, "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); spin_unlock(&proc_subdir_lock); return 0; @@ -354,6 +412,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; + ent->subdir = RB_ROOT; atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -485,7 +544,6 @@ void pde_put(struct proc_dir_entry *pde) */ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *de = NULL; const char *fn = name; unsigned int len; @@ -497,14 +555,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - de = *p; - *p = de->next; - de->next = NULL; - break; - } - } + de = pde_subdir_find(parent, fn, len); + if (de) + rb_erase(&de->subdir_node, &parent->subdir); spin_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -516,16 +569,15 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) if (S_ISDIR(de->mode)) parent->nlink--; de->nlink = 0; - WARN(de->subdir, "%s: removing non-empty directory " - "'%s/%s', leaking at least '%s'\n", __func__, - de->parent->name, de->name, de->subdir->name); + WARN(pde_subdir_first(de), + "%s: removing non-empty directory '%s/%s', leaking at least '%s'\n", + __func__, de->parent->name, de->name, pde_subdir_first(de)->name); pde_put(de); } EXPORT_SYMBOL(remove_proc_entry); int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { - struct proc_dir_entry **p; struct proc_dir_entry *root = NULL, *de, *next; const char *fn = name; unsigned int len; @@ -537,24 +589,18 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) } len = strlen(fn); - for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (proc_match(len, fn, *p)) { - root = *p; - *p = root->next; - root->next = NULL; - break; - } - } + root = pde_subdir_find(parent, fn, len); if (!root) { spin_unlock(&proc_subdir_lock); return -ENOENT; } + rb_erase(&root->subdir_node, &parent->subdir); + de = root; while (1) { - next = de->subdir; + next = pde_subdir_first(de); if (next) { - de->subdir = next->next; - next->next = NULL; + rb_erase(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa7a0ee182e1..7fb1a4869fd0 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -24,10 +24,9 @@ struct mempolicy; * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * - * The "next" pointer creates a linked list of one /proc directory, - * while parent/subdir create the directory structure (every - * /proc file has a parent, but "subdir" is NULL for all - * non-directory entries). + * parent/subdir are used for the directory structure (every /proc file has a + * parent, but "subdir" is empty for all non-directory entries). + * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { unsigned int low_ino; @@ -38,7 +37,9 @@ struct proc_dir_entry { loff_t size; const struct inode_operations *proc_iops; const struct file_operations *proc_fops; - struct proc_dir_entry *next, *parent, *subdir; + struct proc_dir_entry *parent; + struct rb_root subdir; + struct rb_node subdir_node; void *data; atomic_t count; /* use count */ atomic_t in_use; /* number of callers into module in progress; */ diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a63af3e0a612..1bde894bc624 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -192,6 +192,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; + netd->subdir = RB_ROOT; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index 094e44d4a6be..e74ac9f1a2c0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -251,6 +251,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, + .subdir = RB_ROOT, .name = "/proc", }; -- cgit v1.2.3 From b208d54b75399b276b05f9e70cce8d3a59a42547 Mon Sep 17 00:00:00 2001 From: Debabrata Banerjee Date: Wed, 10 Dec 2014 15:45:04 -0800 Subject: procfs: fix error handling of proc_register() proc_register() error paths are leaking inodes and directory refcounts. Signed-off-by: Debabrata Banerjee Cc: Alexander Viro Acked-by: Nicolas Dichtel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9f8fa1e5e8aa..be39c6feb3e5 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -369,14 +369,21 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp dp->proc_iops = &proc_file_inode_operations; } else { WARN_ON(1); + proc_free_inum(dp->low_ino); return -EINVAL; } spin_lock(&proc_subdir_lock); dp->parent = dir; - if (pde_subdir_insert(dir, dp) == false) + if (pde_subdir_insert(dir, dp) == false) { WARN(1, "proc_dir_entry '%s/%s' already registered\n", dir->name, dp->name); + spin_unlock(&proc_subdir_lock); + if (S_ISDIR(dp->mode)) + dir->nlink--; + proc_free_inum(dp->low_ino); + return -EEXIST; + } spin_unlock(&proc_subdir_lock); return 0; -- cgit v1.2.3 From 2fc1e948e820bddf8a686c6e2989219b471d7982 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 10 Dec 2014 15:45:07 -0800 Subject: fs/proc.c: use rb_entry_safe() instead of rb_entry() Better to use existing macro that rewriting them. Signed-off-by: Nicolas Dichtel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index be39c6feb3e5..7fea13229f33 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -41,22 +41,14 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - struct rb_node *node = rb_first(&dir->subdir); - - if (node == NULL) - return NULL; - - return rb_entry(node, struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) { - struct rb_node *node = rb_next(&dir->subdir_node); - - if (node == NULL) - return NULL; - - return rb_entry(node, struct proc_dir_entry, subdir_node); + return rb_entry_safe(rb_next(&dir->subdir_node), struct proc_dir_entry, + subdir_node); } static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, -- cgit v1.2.3 From 4af1036df4dd4f0d59fad9d82ed456bfa2e73fa6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:10 -0800 Subject: proc: task_state: read cred->group_info outside of task_lock() task_state() reads cred->group_info under task_lock() because a long ago it was task_struct->group_info and it was actually protected by task->alloc_lock. Today this task_unlock() after rcu_read_unlock() just adds the confusion, move task_unlock() up. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index cd3653e4f35c..b5810c228c10 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -201,11 +201,10 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "FDSize:\t%d\n" "Groups:\t", fdt ? fdt->max_fds : 0); + task_unlock(p); rcu_read_unlock(); group_info = cred->group_info; - task_unlock(p); - for (g = 0; g < group_info->ngroups; g++) seq_printf(m, "%d ", from_kgid_munged(user_ns, GROUP_AT(group_info, g))); -- cgit v1.2.3 From 0f4a0d53f20c76a70cb5b69b6aa237de2107e171 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:12 -0800 Subject: proc: task_state: deuglify the max_fds calculation 1. The usage of fdt looks very ugly, it can't be NULL if ->files is not NULL. We can use "unsigned int max_fds" instead. 2. This also allows to move seq_printf(max_fds) outside of task_lock() and join it with the previous seq_printf(). See also the next patch. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index b5810c228c10..7c8d9aecb070 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,9 +157,9 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; - struct fdtable *fdt = NULL; const struct cred *cred; pid_t ppid, tpid; + unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? @@ -171,6 +171,12 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, tpid = task_pid_nr_ns(tracer, ns); } cred = get_task_cred(p); + + task_lock(p); + if (p->files) + max_fds = files_fdtable(p->files)->max_fds; + task_unlock(p); + seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" @@ -179,7 +185,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "PPid:\t%d\n" "TracerPid:\t%d\n" "Uid:\t%d\t%d\t%d\t%d\n" - "Gid:\t%d\t%d\t%d\t%d\n", + "Gid:\t%d\t%d\t%d\t%d\n" + "FDSize:\t%d\nGroups:\t", get_task_state(p), task_tgid_nr_ns(p, ns), task_numa_group_id(p), @@ -192,16 +199,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, cred->gid), from_kgid_munged(user_ns, cred->egid), from_kgid_munged(user_ns, cred->sgid), - from_kgid_munged(user_ns, cred->fsgid)); - - task_lock(p); - if (p->files) - fdt = files_fdtable(p->files); - seq_printf(m, - "FDSize:\t%d\n" - "Groups:\t", - fdt ? fdt->max_fds : 0); - task_unlock(p); + from_kgid_munged(user_ns, cred->fsgid), + max_fds); rcu_read_unlock(); group_info = cred->group_info; -- cgit v1.2.3 From b0fafc11115938a3215723f37e8e9cc6a94b05b0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:15 -0800 Subject: proc: task_state: move the main seq_printf() outside of rcu_read_lock() task_state() does seq_printf() under rcu_read_lock(), but this is only needed for task_tgid_nr_ns() and task_numa_group_id(). We can calculate tgid/ngid and drop rcu lock. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Reviewed-by: Paul E. McKenney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 7c8d9aecb070..800e30f8f284 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -158,7 +158,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct group_info *group_info; int g; const struct cred *cred; - pid_t ppid, tpid; + pid_t ppid, tpid, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); @@ -170,12 +170,16 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } + + tgid = task_tgid_nr_ns(p, ns); + ngid = task_numa_group_id(p); cred = get_task_cred(p); task_lock(p); if (p->files) max_fds = files_fdtable(p->files)->max_fds; task_unlock(p); + rcu_read_unlock(); seq_printf(m, "State:\t%s\n" @@ -188,10 +192,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, "Gid:\t%d\t%d\t%d\t%d\n" "FDSize:\t%d\nGroups:\t", get_task_state(p), - task_tgid_nr_ns(p, ns), - task_numa_group_id(p), - pid_nr_ns(pid, ns), - ppid, tpid, + tgid, ngid, pid_nr_ns(pid, ns), ppid, tpid, from_kuid_munged(user_ns, cred->uid), from_kuid_munged(user_ns, cred->euid), from_kuid_munged(user_ns, cred->suid), @@ -201,7 +202,6 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, cred->sgid), from_kgid_munged(user_ns, cred->fsgid), max_fds); - rcu_read_unlock(); group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) -- cgit v1.2.3 From abdba6e9ea6d3903c2b0618db720e17b3c1c705c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:45:18 -0800 Subject: proc: task_state: ptrace_parent() doesn't need pid_alive() check p->ptrace != 0 means that release_task(p) was not called, so pid_alive() buys nothing and we can remove this check. Other callers already use it directly without additional checks. Note: with or without this patch ptrace_parent() can return the pointer to the freed task, this will be explained/fixed later. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: Alexey Dobriyan Cc: "Eric W. Biederman" , Cc: Sterling Alexander Cc: Peter Zijlstra Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/array.c b/fs/proc/array.c index 800e30f8f284..bd117d065b82 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -157,19 +157,18 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct user_namespace *user_ns = seq_user_ns(m); struct group_info *group_info; int g; + struct task_struct *tracer; const struct cred *cred; - pid_t ppid, tpid, tgid, ngid; + pid_t ppid, tpid = 0, tgid, ngid; unsigned int max_fds = 0; rcu_read_lock(); ppid = pid_alive(p) ? task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0; - tpid = 0; - if (pid_alive(p)) { - struct task_struct *tracer = ptrace_parent(p); - if (tracer) - tpid = task_pid_nr_ns(tracer, ns); - } + + tracer = ptrace_parent(p); + if (tracer) + tpid = task_pid_nr_ns(tracer, ns); tgid = task_tgid_nr_ns(p, ns); ngid = task_numa_group_id(p); -- cgit v1.2.3 From c35a7f18a0b237261dad57c9abd3adfa73f315e1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:54:56 -0800 Subject: exit: proc: don't try to flush /proc/tgid/task/tgid proc_flush_task_mnt() always tries to flush task/pid, but this is pointless if we reap the leader. d_invalidate() is recursive, and if nothing else the next d_hash_and_lookup(tgid) should fail anyway. Signed-off-by: Oleg Nesterov Cc: Aaron Tomlin Cc: "Eric W. Biederman" Cc: Rik van Riel Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index 772efa45a452..e7b04a321cc1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2618,6 +2618,9 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) dput(dentry); } + if (pid == tgid) + return; + name.name = buf; name.len = snprintf(buf, sizeof(buf), "%d", tgid); leader = d_hash_and_lookup(mnt->mnt_root, &name); -- cgit v1.2.3