From 3f10c2fa40e444b8cacf82adcbbcd3602b99a645 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 14 Dec 2020 19:03:31 -0800 Subject: fs/ntfs: remove unused varibles We actually don't use these varibles, so remove them to avoid gcc warning: fs/ntfs/file.c:326:14: warning: variable `base_ni' set but not used [-Wunused-but-set-variable] fs/ntfs/logfile.c:481:21: warning: variable `log_page_mask' set but not used [-Wunused-but-set-variable] Link: https://lkml.kernel.org/r/1604821092-54631-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/file.c | 5 +---- fs/ntfs/logfile.c | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f42967b738eb..e5aab265dff1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -323,7 +323,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, unsigned long flags; struct file *file = iocb->ki_filp; struct inode *vi = file_inode(file); - ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_inode *ni = NTFS_I(vi); ntfs_volume *vol = ni->vol; ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " @@ -365,9 +365,6 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, err = -EOPNOTSUPP; goto out; } - base_ni = ni; - if (NInoAttr(ni)) - base_ni = ni->ext.base_ntfs_ino; err = file_remove_privs(file); if (unlikely(err)) goto out; diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index a0c40f1be7ac..bc1bf217b38e 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -478,7 +478,7 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) u8 *kaddr = NULL; RESTART_PAGE_HEADER *rstr1_ph = NULL; RESTART_PAGE_HEADER *rstr2_ph = NULL; - int log_page_size, log_page_mask, err; + int log_page_size, err; bool logfile_is_empty = true; u8 log_page_bits; @@ -501,7 +501,6 @@ bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp) log_page_size = DefaultLogPageSize; else log_page_size = PAGE_SIZE; - log_page_mask = log_page_size - 1; /* * Use ntfs_ffs() instead of ffs() to enable the compiler to * optimize log_page_size and log_page_bits into constants. -- cgit v1.2.3 From 4dad18f47767f97f859fad84a8c2c8ee8323c2b9 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 14 Dec 2020 19:03:34 -0800 Subject: fs/ntfs: remove unused variable attr_len This variable isn't used anymore, remove it to skip W=1 warning: fs/ntfs/inode.c:2350:6: warning: variable `attr_len' set but not used [-Wunused-but-set-variable] Link: https://lkml.kernel.org/r/4194376f-898b-b602-81c3-210567712092@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index caf563981532..f7e4cbc26eaf 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -2347,7 +2347,6 @@ int ntfs_truncate(struct inode *vi) ATTR_RECORD *a; const char *te = " Leaving file length out of sync with i_size."; int err, mp_size, size_change, alloc_change; - u32 attr_len; ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); BUG_ON(NInoAttr(ni)); @@ -2721,7 +2720,6 @@ do_non_resident_truncate: * this cannot fail since we are making the attribute smaller thus by * definition there is enough space to do so. */ - attr_len = le32_to_cpu(a->length); err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(a->data.non_resident.mapping_pairs_offset)); BUG_ON(err); -- cgit v1.2.3 From a0823b5e4434d349c92ec5f7cec0c6e98788d9b6 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Mon, 14 Dec 2020 19:03:37 -0800 Subject: fs/ocfs2/cluster/tcp.c: remove unneeded break A break is not needed if it is preceded by a goto Link: https://lkml.kernel.org/r/20201019175216.2329-1-trix@redhat.com Signed-off-by: Tom Rix Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 79a231719460..3bd8119bed5e 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1198,7 +1198,6 @@ static int o2net_process_message(struct o2net_sock_container *sc, msglog(hdr, "bad magic\n"); ret = -EINVAL; goto out; - break; } /* find a handler for it */ -- cgit v1.2.3 From 45680967ee29e67b62e6800a8780440b840a0b1f Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Mon, 14 Dec 2020 19:03:40 -0800 Subject: ocfs2: ratelimit the 'max lookup times reached' notice Running stress-ng on ocfs2 completely fills the kernel log with 'max lookup times reached, filesystem may have nested directories.' Let's ratelimit this message as done with others in the code. Test-case: # mkfs.ocfs2 --mount local $DEV # mount $DEV $MNT # cd $MNT # dmesg -C # stress-ng --dirdeep 1 --dirdeep-ops 1000 # dmesg | grep -c 'max lookup times reached' Before: # dmesg -C # stress-ng --dirdeep 1 --dirdeep-ops 1000 ... stress-ng: info: [11116] successful run completed in 3.03s # dmesg | grep -c 'max lookup times reached' 967 After: # dmesg -C # stress-ng --dirdeep 1 --dirdeep-ops 1000 ... stress-ng: info: [739] successful run completed in 0.96s # dmesg | grep -c 'max lookup times reached' 10 # dmesg [ 259.086086] ocfs2_check_if_ancestor: 1990 callbacks suppressed [ 259.086092] (stress-ng-dirde,740,1):ocfs2_check_if_ancestor:1091 max lookup times reached, filesystem may have nested directories, src inode: 18007, dest inode: 17940. ... Link: https://lkml.kernel.org/r/20201001224417.478263-1-mfo@canonical.com Signed-off-by: Mauricio Faria de Oliveira Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Gang He Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index c46bf7f581a1..2a237ab00453 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1088,8 +1088,8 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, child_inode_no = parent_inode_no; if (++i >= MAX_LOOKUP_TIMES) { - mlog(ML_NOTICE, "max lookup times reached, filesystem " - "may have nested directories, " + mlog_ratelimited(ML_NOTICE, "max lookup times reached, " + "filesystem may have nested directories, " "src inode: %llu, dest inode: %llu.\n", (unsigned long long)src_inode_no, (unsigned long long)dest_inode_no); -- cgit v1.2.3 From f0c0c115fb81940f4dba0644ac2a8a43b39c83f3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Mon, 14 Dec 2020 19:07:17 -0800 Subject: mm: memcontrol: account pagetables per node For many workloads, pagetable consumption is significant and it makes sense to expose it in the memory.stat for the memory cgroups. However at the moment, the pagetables are accounted per-zone. Converting them to per-node and using the right interface will correctly account for the memory cgroups as well. [akpm@linux-foundation.org: export __mod_lruvec_page_state to modules for arch/mips/kvm/] Link: https://lkml.kernel.org/r/20201130212541.2781790-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ arch/nds32/mm/mm-nds32.c | 6 +++--- drivers/base/node.c | 2 +- fs/proc/meminfo.c | 2 +- include/linux/mm.h | 8 ++++---- include/linux/mmzone.h | 2 +- mm/memcontrol.c | 2 ++ mm/page_alloc.c | 6 +++--- mm/vmstat.c | 2 +- 9 files changed, 19 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 515bb13084a0..63521cd36ce5 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1274,6 +1274,9 @@ PAGE_SIZE multiple when read back. kernel_stack Amount of memory allocated to kernel stacks. + pagetables + Amount of memory allocated for page tables. + percpu(npn) Amount of memory used for storing per-cpu kernel data structures. diff --git a/arch/nds32/mm/mm-nds32.c b/arch/nds32/mm/mm-nds32.c index 55bec50ccc03..f2778f2b39f6 100644 --- a/arch/nds32/mm/mm-nds32.c +++ b/arch/nds32/mm/mm-nds32.c @@ -34,8 +34,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) cpu_dcache_wb_range((unsigned long)new_pgd, (unsigned long)new_pgd + PTRS_PER_PGD * sizeof(pgd_t)); - inc_zone_page_state(virt_to_page((unsigned long *)new_pgd), - NR_PAGETABLE); + inc_lruvec_page_state(virt_to_page((unsigned long *)new_pgd), + NR_PAGETABLE); return new_pgd; } @@ -59,7 +59,7 @@ void pgd_free(struct mm_struct *mm, pgd_t * pgd) pte = pmd_page(*pmd); pmd_clear(pmd); - dec_zone_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); + dec_lruvec_page_state(virt_to_page((unsigned long *)pgd), NR_PAGETABLE); pte_free(mm, pte); mm_dec_nr_ptes(mm); pmd_free(mm, pmd); diff --git a/drivers/base/node.c b/drivers/base/node.c index 6ffa470e2984..04f71c7bc3f8 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -450,7 +450,7 @@ static ssize_t node_read_meminfo(struct device *dev, #ifdef CONFIG_SHADOW_CALL_STACK nid, node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif - nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), + nid, K(node_page_state(pgdat, NR_PAGETABLE)), nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 887a5532e449..d6fc74619625 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -107,7 +107,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", - global_zone_page_state(NR_PAGETABLE)); + global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..5bbbf4aeee94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2203,7 +2203,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) if (!ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2211,7 +2211,7 @@ static inline void pgtable_pte_page_dtor(struct page *page) { ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } #define pte_offset_map_lock(mm, pmd, address, ptlp) \ @@ -2298,7 +2298,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) if (!pmd_ptlock_init(page)) return false; __SetPageTable(page); - inc_zone_page_state(page, NR_PAGETABLE); + inc_lruvec_page_state(page, NR_PAGETABLE); return true; } @@ -2306,7 +2306,7 @@ static inline void pgtable_pmd_page_dtor(struct page *page) { pmd_ptlock_free(page); __ClearPageTable(page); - dec_zone_page_state(page, NR_PAGETABLE); + dec_lruvec_page_state(page, NR_PAGETABLE); } /* diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fb3bf696c05e..cca2a4443c9c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -152,7 +152,6 @@ enum zone_stat_item { NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ - NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) @@ -207,6 +206,7 @@ enum node_stat_item { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif + NR_PAGETABLE, /* used for pagetables */ NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 52837d68bbec..b9419a3605eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -869,6 +869,7 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } +EXPORT_SYMBOL(__mod_lruvec_page_state); void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { @@ -1493,6 +1494,7 @@ static struct memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "pagetables", PAGE_SIZE, NR_PAGETABLE }, { "percpu", 1, MEMCG_PERCPU_B }, { "sock", PAGE_SIZE, MEMCG_SOCK }, { "shmem", PAGE_SIZE, NR_SHMEM }, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa227a479e4..743fb2bccecc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5465,7 +5465,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_zone_page_state(NR_PAGETABLE), + global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -5497,6 +5497,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif + " pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5522,6 +5523,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif + K(node_page_state(pgdat, NR_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } @@ -5553,7 +5555,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" @@ -5574,7 +5575,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..da36e3b0aab2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1157,7 +1157,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_page_table_pages", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1215,6 +1214,7 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) "nr_shadow_call_stack", #endif + "nr_page_table_pages", /* enum writeback_stat_item counters */ "nr_dirty_threshold", -- cgit v1.2.3 From cd544fd1dc9293c6702fab6effa63dac1cc67e99 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 14 Dec 2020 19:08:13 -0800 Subject: mremap: don't allow MREMAP_DONTUNMAP on special_mappings and aio As kernel expect to see only one of such mappings, any further operations on the VMA-copy may be unexpected by the kernel. Maybe it's being on the safe side, but there doesn't seem to be any expected use-case for this, so restrict it now. Link: https://lkml.kernel.org/r/20201013013416.390574-4-dima@arista.com Fixes: commit e346b3813067 ("mm/mremap: add MREMAP_DONTUNMAP to mremap()") Signed-off-by: Dmitry Safonov Cc: Alexander Viro Cc: Andy Lutomirski Cc: Brian Geffon Cc: Catalin Marinas Cc: Dan Carpenter Cc: Dan Williams Cc: Dave Jiang Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Cc: Minchan Kim Cc: Ralph Campbell Cc: Russell King Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vishal Verma Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- fs/aio.c | 5 ++++- include/linux/mm.h | 2 +- mm/mmap.c | 6 +++++- mm/mremap.c | 2 +- 5 files changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 0daf2f1cf7a8..e916646adc69 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -1458,7 +1458,7 @@ static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) return 0; } -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +static int pseudo_lock_dev_mremap(struct vm_area_struct *area, unsigned long flags) { /* Not supported */ return -EINVAL; diff --git a/fs/aio.c b/fs/aio.c index 6a21d8919409..fd2d68532506 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -324,13 +324,16 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma) +static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/include/linux/mm.h b/include/linux/mm.h index 1d8e84bf718a..dacc889744b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -558,7 +558,7 @@ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); - int (*mremap)(struct vm_area_struct * area); + int (*mremap)(struct vm_area_struct *area, unsigned long flags); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); diff --git a/mm/mmap.c b/mm/mmap.c index 5c8b4485860d..8950c20239a0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3405,10 +3405,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; diff --git a/mm/mremap.c b/mm/mremap.c index 90ff09c09e84..366b3dea992c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -534,7 +534,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { -- cgit v1.2.3 From 5e545df3292fbd3d5963c68980f1527ead2a2b3f Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:09:55 -0800 Subject: arm: remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL ARM is the only architecture that defines CONFIG_ARCH_HAS_HOLES_MEMORYMODEL which in turn enables memmap_valid_within() function that is intended to verify existence of struct page associated with a pfn when there are holes in the memory map. However, the ARCH_HAS_HOLES_MEMORYMODEL also enables HAVE_ARCH_PFN_VALID and arch-specific pfn_valid() implementation that also deals with the holes in the memory map. The only two users of memmap_valid_within() call this function after a call to pfn_valid() so the memmap_valid_within() check becomes redundant. Remove CONFIG_ARCH_HAS_HOLES_MEMORYMODEL and memmap_valid_within() and rely entirely on ARM's implementation of pfn_valid() that is now enabled unconditionally. Link: https://lkml.kernel.org/r/20201101170454.9567-9-rppt@kernel.org Signed-off-by: Mike Rapoport Cc: Alexey Dobriyan Cc: Catalin Marinas Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: John Paul Adrian Glaubitz Cc: Jonathan Corbet Cc: Matt Turner Cc: Meelis Roos Cc: Michael Schmitz Cc: Russell King Cc: Tony Luck Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/memory-model.rst | 3 +-- arch/arm/Kconfig | 8 ++------ arch/arm/mach-bcm/Kconfig | 1 - arch/arm/mach-davinci/Kconfig | 1 - arch/arm/mach-exynos/Kconfig | 1 - arch/arm/mach-highbank/Kconfig | 1 - arch/arm/mach-omap2/Kconfig | 1 - arch/arm/mach-s5pv210/Kconfig | 1 - arch/arm/mach-tango/Kconfig | 1 - fs/proc/kcore.c | 2 -- include/linux/mmzone.h | 31 ------------------------------- mm/mmzone.c | 14 -------------- mm/vmstat.c | 4 ---- 13 files changed, 3 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/Documentation/vm/memory-model.rst b/Documentation/vm/memory-model.rst index 9daadf9faba1..ce398a7dc6cd 100644 --- a/Documentation/vm/memory-model.rst +++ b/Documentation/vm/memory-model.rst @@ -51,8 +51,7 @@ call :c:func:`free_area_init` function. Yet, the mappings array is not usable until the call to :c:func:`memblock_free_all` that hands all the memory to the page allocator. -If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option, -it may free parts of the `mem_map` array that do not cover the +An architecture may free parts of the `mem_map` array that do not cover the actual physical pages. In such case, the architecture specific :c:func:`pfn_valid` implementation should take the holes in the `mem_map` into account. diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 002e0cf025f5..353c3979a2d5 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -25,7 +25,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX @@ -520,7 +520,6 @@ config ARCH_S3C24XX config ARCH_OMAP1 bool "TI OMAP1" depends on MMU - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_OMAP select CLKDEV_LOOKUP select CLKSRC_MMIO @@ -1480,9 +1479,6 @@ config OABI_COMPAT UNPREDICTABLE (in fact it can be predicted that it won't work at all). If in doubt say N. -config ARCH_HAS_HOLES_MEMORYMODEL - bool - config ARCH_SELECT_MEMORY_MODEL bool @@ -1494,7 +1490,7 @@ config ARCH_SPARSEMEM_ENABLE select SPARSEMEM_STATIC if SPARSEMEM config HAVE_ARCH_PFN_VALID - def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM + def_bool y config HIGHMEM bool "High Memory Support" diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig index ae790908fc74..9b594ae98153 100644 --- a/arch/arm/mach-bcm/Kconfig +++ b/arch/arm/mach-bcm/Kconfig @@ -211,7 +211,6 @@ config ARCH_BRCMSTB select BCM7038_L1_IRQ select BRCMSTB_L2_IRQ select BCM7120_L2_IRQ - select ARCH_HAS_HOLES_MEMORYMODEL select ZONE_DMA if ARM_LPAE select SOC_BRCMSTB select SOC_BUS diff --git a/arch/arm/mach-davinci/Kconfig b/arch/arm/mach-davinci/Kconfig index f56ff8c24043..de11030748d0 100644 --- a/arch/arm/mach-davinci/Kconfig +++ b/arch/arm/mach-davinci/Kconfig @@ -5,7 +5,6 @@ menuconfig ARCH_DAVINCI depends on ARCH_MULTI_V5 select DAVINCI_TIMER select ZONE_DMA - select ARCH_HAS_HOLES_MEMORYMODEL select PM_GENERIC_DOMAINS if PM select PM_GENERIC_DOMAINS_OF if PM && OF select REGMAP_MMIO diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig index d2d249706ebb..56d272967fc0 100644 --- a/arch/arm/mach-exynos/Kconfig +++ b/arch/arm/mach-exynos/Kconfig @@ -8,7 +8,6 @@ menuconfig ARCH_EXYNOS bool "Samsung Exynos" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_GIC diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig index 1bc68913d62c..9de38ce8124f 100644 --- a/arch/arm/mach-highbank/Kconfig +++ b/arch/arm/mach-highbank/Kconfig @@ -2,7 +2,6 @@ config ARCH_HIGHBANK bool "Calxeda ECX-1000/2000 (Highbank/Midway)" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_SUPPORTS_BIG_ENDIAN select ARM_AMBA select ARM_ERRATA_764369 if SMP diff --git a/arch/arm/mach-omap2/Kconfig b/arch/arm/mach-omap2/Kconfig index 3f62a0c9450d..164985505f9e 100644 --- a/arch/arm/mach-omap2/Kconfig +++ b/arch/arm/mach-omap2/Kconfig @@ -93,7 +93,6 @@ config SOC_DRA7XX config ARCH_OMAP2PLUS bool select ARCH_HAS_BANDGAP - select ARCH_HAS_HOLES_MEMORYMODEL select ARCH_HAS_RESET_CONTROLLER select ARCH_OMAP select CLKSRC_MMIO diff --git a/arch/arm/mach-s5pv210/Kconfig b/arch/arm/mach-s5pv210/Kconfig index 95d4e8284866..d644b45bc29d 100644 --- a/arch/arm/mach-s5pv210/Kconfig +++ b/arch/arm/mach-s5pv210/Kconfig @@ -8,7 +8,6 @@ config ARCH_S5PV210 bool "Samsung S5PV210/S5PC110" depends on ARCH_MULTI_V7 - select ARCH_HAS_HOLES_MEMORYMODEL select ARM_VIC select CLKSRC_SAMSUNG_PWM select COMMON_CLK_SAMSUNG diff --git a/arch/arm/mach-tango/Kconfig b/arch/arm/mach-tango/Kconfig index 25b2fd434861..a9eeda36aeb1 100644 --- a/arch/arm/mach-tango/Kconfig +++ b/arch/arm/mach-tango/Kconfig @@ -3,7 +3,6 @@ config ARCH_TANGO bool "Sigma Designs Tango4 (SMP87xx)" depends on ARCH_MULTI_V7 # Cortex-A9 MPCore r3p0, PL310 r3p2 - select ARCH_HAS_HOLES_MEMORYMODEL select ARM_ERRATA_754322 select ARM_ERRATA_764369 if SMP select ARM_ERRATA_775420 diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e502414b3556..4d2e64e9016c 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -193,8 +193,6 @@ kclist_add_private(unsigned long pfn, unsigned long nr_pages, void *arg) return 1; p = pfn_to_page(pfn); - if (!memmap_valid_within(pfn, p, page_zone(p))) - return 1; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 16fb5522a74f..6e0025b5a88f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1440,37 +1440,6 @@ void sparse_init(void); #define pfn_valid_within(pfn) (1) #endif -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -/* - * pfn_valid() is meant to be able to tell if a given PFN has valid memmap - * associated with it or not. This means that a struct page exists for this - * pfn. The caller cannot assume the page is fully initialized in general. - * Hotplugable pages might not have been onlined yet. pfn_to_online_page() - * will ensure the struct page is fully online and initialized. Special pages - * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. - * - * In FLATMEM, it is expected that holes always have valid memmap as long as - * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed - * that a valid section has a memmap for the entire section. - * - * However, an ARM, and maybe other embedded architectures in the future - * free memmap backing holes to save memory on the assumption the memmap is - * never used. The page_zone linkages are then broken even though pfn_valid() - * returns true. A walker of the full memmap must then do this additional - * check to ensure the memmap they are looking at is sane by making sure - * the zone and PFN linkages are still valid. This is expensive, but walkers - * of the full memmap are extremely rare. - */ -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone); -#else -static inline bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */ diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..f337831affc2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,20 +72,6 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, return z; } -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL -bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - if (page_to_pfn(page) != pfn) - return false; - - if (page_zone(page) != zone) - return false; - - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; diff --git a/mm/vmstat.c b/mm/vmstat.c index da36e3b0aab2..f8942160fc95 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1503,10 +1503,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!page) continue; - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) - continue; - if (page_zone(page) != zone) continue; -- cgit v1.2.3 From 37cd0575b8510159992d279c530c05f872990b02 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Mon, 14 Dec 2020 19:13:49 -0800 Subject: userfaultfd: add UFFD_USER_MODE_ONLY Patch series "Control over userfaultfd kernel-fault handling", v6. This patch series is split from [1]. The other series enables SELinux support for userfaultfd file descriptors so that its creation and movement can be controlled. It has been demonstrated on various occasions that suspending kernel code execution for an arbitrary amount of time at any access to userspace memory (copy_from_user()/copy_to_user()/...) can be exploited to change the intended behavior of the kernel. For instance, handling page faults in kernel-mode using userfaultfd has been exploited in [2, 3]. Likewise, FUSE, which is similar to userfaultfd in this respect, has been exploited in [4, 5] for similar outcome. This small patch series adds a new flag to userfaultfd(2) that allows callers to give up the ability to handle kernel-mode faults with the resulting UFFD file object. It then adds a 'user-mode only' option to the unprivileged_userfaultfd sysctl knob to require unprivileged callers to use this new flag. The purpose of this new interface is to decrease the chance of an unprivileged userfaultfd user taking advantage of userfaultfd to enhance security vulnerabilities by lengthening the race window in kernel code. [1] https://lore.kernel.org/lkml/20200211225547.235083-1-dancol@google.com/ [2] https://duasynt.com/blog/linux-kernel-heap-spray [3] https://duasynt.com/blog/cve-2016-6187-heap-off-by-one-exploit [4] https://googleprojectzero.blogspot.com/2016/06/exploiting-recursion-in-linux-kernel_20.html [5] https://bugs.chromium.org/p/project-zero/issues/detail?id=808 This patch (of 2): userfaultfd handles page faults from both user and kernel code. Add a new UFFD_USER_MODE_ONLY flag for userfaultfd(2) that makes the resulting userfaultfd object refuse to handle faults from kernel mode, treating these faults as if SIGBUS were always raised, causing the kernel code to fail with EFAULT. A future patch adds a knob allowing administrators to give some processes the ability to create userfaultfd file objects only if they pass UFFD_USER_MODE_ONLY, reducing the likelihood that these processes will exploit userfaultfd's ability to delay kernel page faults to open timing windows for future exploits. Link: https://lkml.kernel.org/r/20201120030411.2690816-1-lokeshgidra@google.com Link: https://lkml.kernel.org/r/20201120030411.2690816-2-lokeshgidra@google.com Signed-off-by: Daniel Colascione Signed-off-by: Lokesh Gidra Reviewed-by: Andrea Arcangeli Cc: Alexander Viro Cc: Cc: Daniel Colascione Cc: Eric Biggers Cc: Iurii Zaikin Cc: Jeff Vander Stoep Cc: Jerome Glisse Cc: "Joel Fernandes (Google)" Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kalesh Singh Cc: Kees Cook Cc: Luis Chamberlain Cc: Mauro Carvalho Chehab Cc: Mel Gorman Cc: Mike Rapoport Cc: Nitin Gupta Cc: Peter Xu Cc: Sebastian Andrzej Siewior Cc: Shaohua Li Cc: Stephen Smalley Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 10 +++++++++- include/uapi/linux/userfaultfd.h | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 000b457ad087..605599fde015 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -405,6 +405,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; + if ((vmf->flags & FAULT_FLAG_USER) == 0 && + ctx->flags & UFFD_USER_MODE_ONLY) { + printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " + "sysctl knob to 1 if kernel faults must be handled " + "without obtaining CAP_SYS_PTRACE capability\n"); + goto out; + } /* * If it's already released don't get it. This avoids to loop @@ -1965,10 +1972,11 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ + BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS); BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - if (flags & ~UFFD_SHARED_FCNTL_FLAGS) + if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY)) return -EINVAL; ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index e7e98bde221f..5f2d88212f7c 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -257,4 +257,13 @@ struct uffdio_writeprotect { __u64 mode; }; +/* + * Flags for the userfaultfd(2) system call itself. + */ + +/* + * Create a userfaultfd that can handle page faults only in user mode. + */ +#define UFFD_USER_MODE_ONLY 1 + #endif /* _LINUX_USERFAULTFD_H */ -- cgit v1.2.3 From d0d4730ac2e404a5b0da9a87ef38c73e51cb1664 Mon Sep 17 00:00:00 2001 From: Lokesh Gidra Date: Mon, 14 Dec 2020 19:13:54 -0800 Subject: userfaultfd: add user-mode only option to unprivileged_userfaultfd sysctl knob With this change, when the knob is set to 0, it allows unprivileged users to call userfaultfd, like when it is set to 1, but with the restriction that page faults from only user-mode can be handled. In this mode, an unprivileged user (without SYS_CAP_PTRACE capability) must pass UFFD_USER_MODE_ONLY to userfaultd or the API will fail with EPERM. This enables administrators to reduce the likelihood that an attacker with access to userfaultfd can delay faulting kernel code to widen timing windows for other exploits. The default value of this knob is changed to 0. This is required for correct functioning of pipe mutex. However, this will fail postcopy live migration, which will be unnoticeable to the VM guests. To avoid this, set 'vm.userfault = 1' in /sys/sysctl.conf. The main reason this change is desirable as in the short term is that the Android userland will behave as with the sysctl set to zero. So without this commit, any Linux binary using userfaultfd to manage its memory would behave differently if run within the Android userland. For more details, refer to Andrea's reply [1]. [1] https://lore.kernel.org/lkml/20200904033438.GI9411@redhat.com/ Link: https://lkml.kernel.org/r/20201120030411.2690816-3-lokeshgidra@google.com Signed-off-by: Lokesh Gidra Reviewed-by: Andrea Arcangeli Cc: Kees Cook Cc: Jonathan Corbet Cc: Peter Xu Cc: Sebastian Andrzej Siewior Cc: Alexander Viro Cc: Stephen Smalley Cc: Eric Biggers Cc: Daniel Colascione Cc: "Joel Fernandes (Google)" Cc: Kalesh Singh Cc: Suren Baghdasaryan Cc: Jeff Vander Stoep Cc: Cc: Mike Rapoport Cc: Shaohua Li Cc: Jerome Glisse Cc: Mauro Carvalho Chehab Cc: Johannes Weiner Cc: Mel Gorman Cc: Nitin Gupta Cc: Vlastimil Babka Cc: Iurii Zaikin Cc: Luis Chamberlain Cc: Daniel Colascione Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 15 ++++++++++----- fs/userfaultfd.c | 10 ++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index f455fa00c00f..d06a98b2a4e7 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -873,12 +873,17 @@ file-backed pages is less than the high watermark in a zone. unprivileged_userfaultfd ======================== -This flag controls whether unprivileged users can use the userfaultfd -system calls. Set this to 1 to allow unprivileged users to use the -userfaultfd system calls, or set this to 0 to restrict userfaultfd to only -privileged users (with SYS_CAP_PTRACE capability). +This flag controls the mode in which unprivileged users can use the +userfaultfd system calls. Set this to 0 to restrict unprivileged users +to handle page faults in user mode only. In this case, users without +SYS_CAP_PTRACE must pass UFFD_USER_MODE_ONLY in order for userfaultfd to +succeed. Prohibiting use of userfaultfd for handling faults from kernel +mode may make certain vulnerabilities more difficult to exploit. -The default value is 1. +Set this to 1 to allow unprivileged users to use the userfaultfd system +calls without any restrictions. + +The default value is 0. user_reserve_kbytes diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 605599fde015..894cc28142e7 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -28,7 +28,7 @@ #include #include -int sysctl_unprivileged_userfaultfd __read_mostly = 1; +int sysctl_unprivileged_userfaultfd __read_mostly; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; @@ -1966,8 +1966,14 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) + if (!sysctl_unprivileged_userfaultfd && + (flags & UFFD_USER_MODE_ONLY) == 0 && + !capable(CAP_SYS_PTRACE)) { + printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " + "sysctl knob to 1 if kernel faults must be handled " + "without obtaining CAP_SYS_PTRACE capability\n"); return -EPERM; + } BUG_ON(!current->mm); -- cgit v1.2.3