From 7302e91f39a81a9c2efcf4bc5749d18128366945 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:03:58 -0800 Subject: mm/slab_common: use WARN() if cache still has objects on destroy Calling kmem_cache_destroy() while the cache still has objects allocated is a kernel bug, and will usually result in the entire cache being leaked. While the message in kmem_cache_destroy() resembles a warning, it is currently not implemented using a real WARN(). This is problematic for infrastructure testing the kernel, all of which rely on the specific format of WARN()s to pick up on bugs. Some 13 years ago this used to be a simple WARN_ON() in slub, but commit d629d8195793 ("slub: improve kmem_cache_destroy() error message") changed it into an open-coded warning to avoid confusion with a bug in slub itself. Instead, turn the open-coded warning into a real WARN() with the message preserved, so that test systems can actually identify these issues, and we get all the other benefits of using a normal WARN(). The warning message is extended with "when called from " to make it even clearer where the fault lies. For most configurations this is only a cosmetic change, however, note that WARN() here will now also respect panic_on_warn. Link: https://lkml.kernel.org/r/20211102170733.648216-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Vlastimil Babka Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index e5d080a93009..c6213f18eb3a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,8 +489,6 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int err; - if (unlikely(!s)) return; @@ -501,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - err = shutdown_cache(s); - if (err) { - pr_err("%s %s: Slab cache still has objects\n", - __func__, s->name); - dump_stack(); - } + WARN(shutdown_cache(s), + "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); -- cgit v1.2.3 From c29b5b3d33a61e122cb493917ba51c82bcac4121 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 14 Jan 2022 14:04:01 -0800 Subject: mm: slab: make slab iterator functions static There is no external users of slab_start/next/stop(), so make them static. And the memory.kmem.slabinfo is deprecated, which outputs nothing now, so move memcg_slab_show() into mm/memcontrol.c and rename it to mem_cgroup_slab_show to be consistent with other function names. Link: https://lkml.kernel.org/r/20211109133359.32881-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 ++++++++++++- mm/slab.h | 5 ----- mm/slab_common.c | 17 +++-------------- 3 files changed, 15 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2ed5f2a0879d..3542237c833f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4845,6 +4845,17 @@ out_kfree: return ret; } +#if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) +static int mem_cgroup_slab_show(struct seq_file *m, void *p) +{ + /* + * Deprecated. + * Please, take a look at tools/cgroup/slabinfo.py . + */ + return 0; +} +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4945,7 +4956,7 @@ static struct cftype mem_cgroup_legacy_files[] = { (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)) { .name = "kmem.slabinfo", - .seq_show = memcg_slab_show, + .seq_show = mem_cgroup_slab_show, }, #endif { diff --git a/mm/slab.h b/mm/slab.h index 56ad7eea3ddf..053eefaf6cbd 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -575,11 +575,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif -void *slab_start(struct seq_file *m, loff_t *pos); -void *slab_next(struct seq_file *m, void *p, loff_t *pos); -void slab_stop(struct seq_file *m, void *p); -int memcg_slab_show(struct seq_file *m, void *p); - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) void dump_unreclaimable_slab(void); #else diff --git a/mm/slab_common.c b/mm/slab_common.c index c6213f18eb3a..b7c431819cdb 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1039,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m) seq_putc(m, '\n'); } -void *slab_start(struct seq_file *m, loff_t *pos) +static void *slab_start(struct seq_file *m, loff_t *pos) { mutex_lock(&slab_mutex); return seq_list_start(&slab_caches, *pos); } -void *slab_next(struct seq_file *m, void *p, loff_t *pos) +static void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); } -void slab_stop(struct seq_file *m, void *p) +static void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@ -1118,17 +1118,6 @@ void dump_unreclaimable_slab(void) mutex_unlock(&slab_mutex); } -#if defined(CONFIG_MEMCG_KMEM) -int memcg_slab_show(struct seq_file *m, void *p) -{ - /* - * Deprecated. - * Please, take a look at tools/cgroup/slabinfo.py . - */ - return 0; -} -#endif - /* * slabinfo_op - iterator that generates /proc/slabinfo * -- cgit v1.2.3 From ad1a3e15fcd3b8ba0f5f60f6a2fe3938274fdf65 Mon Sep 17 00:00:00 2001 From: Kuan-Ying Lee Date: Fri, 14 Jan 2022 14:04:04 -0800 Subject: kmemleak: fix kmemleak false positive report with HW tag-based kasan enable With HW tag-based kasan enable, We will get the warning when we free object whose address starts with 0xFF. It is because kmemleak rbtree stores tagged object and this freeing object's tag does not match with rbtree object. In the example below, kmemleak rbtree stores the tagged object in the kmalloc(), and kfree() gets the pointer with 0xFF tag. Call sequence: ptr = kmalloc(size, GFP_KERNEL); page = virt_to_page(ptr); offset = offset_in_page(ptr); kfree(page_address(page) + offset); ptr = kmalloc(size, GFP_KERNEL); A sequence like that may cause the warning as following: 1) Freeing unknown object: In kfree(), we will get free unknown object warning in kmemleak_free(). Because object(0xFx) in kmemleak rbtree and pointer(0xFF) in kfree() have different tag. 2) Overlap existing: When we allocate that object with the same hw-tag again, we will find the overlap in the kmemleak rbtree and kmemleak thread will be killed. kmemleak: Freeing unknown object at 0xffff000003f88000 CPU: 5 PID: 177 Comm: cat Not tainted 5.16.0-rc1-dirty #21 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1ac show_stack+0x1c/0x30 dump_stack_lvl+0x68/0x84 dump_stack+0x1c/0x38 kmemleak_free+0x6c/0x70 slab_free_freelist_hook+0x104/0x200 kmem_cache_free+0xa8/0x3d4 test_version_show+0x270/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 el0_svc+0x20/0x80 el0t_64_sync_handler+0x1a8/0x1b0 el0t_64_sync+0x1ac/0x1b0 ... kmemleak: Cannot insert 0xf2ff000003f88000 into the object search tree (overlaps existing) CPU: 5 PID: 178 Comm: cat Not tainted 5.16.0-rc1-dirty #21 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace+0x0/0x1ac show_stack+0x1c/0x30 dump_stack_lvl+0x68/0x84 dump_stack+0x1c/0x38 create_object.isra.0+0x2d8/0x2fc kmemleak_alloc+0x34/0x40 kmem_cache_alloc+0x23c/0x2f0 test_version_show+0x1fc/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 el0_svc+0x20/0x80 el0t_64_sync_handler+0x1a8/0x1b0 el0t_64_sync+0x1ac/0x1b0 kmemleak: Kernel memory leak detector disabled kmemleak: Object 0xf2ff000003f88000 (size 128): kmemleak: comm "cat", pid 177, jiffies 4294921177 kmemleak: min_count = 1 kmemleak: count = 0 kmemleak: flags = 0x1 kmemleak: checksum = 0 kmemleak: backtrace: kmem_cache_alloc+0x23c/0x2f0 test_version_show+0x1fc/0x3a0 module_attr_show+0x28/0x40 sysfs_kf_seq_show+0xb0/0x130 kernfs_seq_show+0x30/0x40 seq_read_iter+0x1bc/0x4b0 kernfs_fop_read_iter+0x144/0x1c0 generic_file_splice_read+0xd0/0x184 do_splice_to+0x90/0xe0 splice_direct_to_actor+0xb8/0x250 do_splice_direct+0x88/0xd4 do_sendfile+0x2b0/0x344 __arm64_sys_sendfile64+0x164/0x16c invoke_syscall+0x48/0x114 el0_svc_common.constprop.0+0x44/0xec do_el0_svc+0x74/0x90 kmemleak: Automatic memory scanning thread ended [akpm@linux-foundation.org: whitespace tweak] Link: https://lkml.kernel.org/r/20211118054426.4123-1-Kuan-Ying.Lee@mediatek.com Signed-off-by: Kuan-Ying Lee Reviewed-by: Catalin Marinas Cc: Doug Berger Cc: Mel Gorman Cc: Peter Zijlstra Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index b57383c17cf6..dc3758fdba68 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object) static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) { struct rb_node *rb = object_tree_root.rb_node; + unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); while (rb) { - struct kmemleak_object *object = - rb_entry(rb, struct kmemleak_object, rb_node); - if (ptr < object->pointer) + struct kmemleak_object *object; + unsigned long untagged_objp; + + object = rb_entry(rb, struct kmemleak_object, rb_node); + untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); + + if (untagged_ptr < untagged_objp) rb = object->rb_node.rb_left; - else if (object->pointer + object->size <= ptr) + else if (untagged_objp + object->size <= untagged_ptr) rb = object->rb_node.rb_right; - else if (object->pointer == ptr || alias) + else if (untagged_objp == untagged_ptr || alias) return object; else { kmemleak_warn("Found object by alias at 0x%08lx\n", @@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object, *parent; struct rb_node **link, *rb_parent; unsigned long untagged_ptr; + unsigned long untagged_objp; object = mem_pool_alloc(gfp); if (!object) { @@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, while (*link) { rb_parent = *link; parent = rb_entry(rb_parent, struct kmemleak_object, rb_node); - if (ptr + size <= parent->pointer) + untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer); + if (untagged_ptr + size <= untagged_objp) link = &parent->rb_node.rb_left; - else if (parent->pointer + parent->size <= ptr) + else if (untagged_objp + parent->size <= untagged_ptr) link = &parent->rb_node.rb_right; else { kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", -- cgit v1.2.3 From 60115fa54ad7b913b7cb5844e6b7ffeb842d55f2 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 14 Jan 2022 14:04:11 -0800 Subject: mm: defer kmemleak object creation of module_alloc() Yongqiang reports a kmemleak panic when module insmod/rmmod with KASAN enabled(without KASAN_VMALLOC) on x86[1]. When the module area allocates memory, it's kmemleak_object is created successfully, but the KASAN shadow memory of module allocation is not ready, so when kmemleak scan the module's pointer, it will panic due to no shadow memory with KASAN check. module_alloc __vmalloc_node_range kmemleak_vmalloc kmemleak_scan update_checksum kasan_module_alloc kmemleak_ignore Note, there is no problem if KASAN_VMALLOC enabled, the modules area entire shadow memory is preallocated. Thus, the bug only exits on ARCH which supports dynamic allocation of module area per module load, for now, only x86/arm64/s390 are involved. Add a VM_DEFER_KMEMLEAK flags, defer vmalloc'ed object register of kmemleak in module_alloc() to fix this issue. [1] https://lore.kernel.org/all/6d41e2b9-4692-5ec4-b1cd-cbe29ae89739@huawei.com/ [wangkefeng.wang@huawei.com: fix build] Link: https://lkml.kernel.org/r/20211125080307.27225-1-wangkefeng.wang@huawei.com [akpm@linux-foundation.org: simplify ifdefs, per Andrey] Link: https://lkml.kernel.org/r/CA+fCnZcnwJHUQq34VuRxpdoY6_XbJCDJ-jopksS5Eia4PijPzw@mail.gmail.com Link: https://lkml.kernel.org/r/20211124142034.192078-1-wangkefeng.wang@huawei.com Fixes: 793213a82de4 ("s390/kasan: dynamic shadow mem allocation for modules") Fixes: 39d114ddc682 ("arm64: add KASAN support") Fixes: bebf56a1b176 ("kasan: enable instrumentation of global variables") Signed-off-by: Kefeng Wang Reported-by: Yongqiang Liu Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Alexander Gordeev Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Alexander Potapenko Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/module.c | 4 ++-- arch/s390/kernel/module.c | 5 +++-- arch/x86/kernel/module.c | 7 ++++--- include/linux/kasan.h | 4 ++-- include/linux/vmalloc.h | 7 +++++++ mm/kasan/shadow.c | 9 +++++++-- mm/vmalloc.c | 3 ++- 7 files changed, 27 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index b5ec010c481f..309a27553c87 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) module_alloc_end = MODULES_END; p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, 0, + module_alloc_end, gfp_mask, PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b01ba460b7ca..d52d85367bf7 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -37,14 +37,15 @@ void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END, - GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + gfp_mask, PAGE_KERNEL_EXEC, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 169fb6f4cd2e..95fa745e310a 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -67,6 +67,7 @@ static unsigned long int get_module_load_offset(void) void *module_alloc(unsigned long size) { + gfp_t gfp_mask = GFP_KERNEL; void *p; if (PAGE_ALIGN(size) > MODULES_LEN) @@ -74,10 +75,10 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), - MODULES_END, GFP_KERNEL, - PAGE_KERNEL, 0, NUMA_NO_NODE, + MODULES_END, gfp_mask, + PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, __builtin_return_address(0)); - if (p && (kasan_module_alloc(p, size) < 0)) { + if (p && (kasan_module_alloc(p, size, gfp_mask) < 0)) { vfree(p); return NULL; } diff --git a/include/linux/kasan.h b/include/linux/kasan.h index d8783b682669..89c99e5e67de 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -474,12 +474,12 @@ static inline void kasan_populate_early_vm_area_shadow(void *start, * allocations with real shadow memory. With KASAN vmalloc, the special * case is unnecessary, as the work is handled in the generic case. */ -int kasan_module_alloc(void *addr, size_t size); +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask); void kasan_free_shadow(const struct vm_struct *vm); #else /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ -static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } +static inline int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* (CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS) && !CONFIG_KASAN_VMALLOC */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6e022cc712e6..880227b9f044 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -28,6 +28,13 @@ struct notifier_block; /* in notifier.h */ #define VM_MAP_PUT_PAGES 0x00000200 /* put pages and free array in vfree */ #define VM_NO_HUGE_VMAP 0x00000400 /* force PAGE_SIZE pte mapping */ +#if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \ + !defined(CONFIG_KASAN_VMALLOC) +#define VM_DEFER_KMEMLEAK 0x00000800 /* defer kmemleak object creation */ +#else +#define VM_DEFER_KMEMLEAK 0 +#endif + /* * VM_KASAN is used slightly differently depending on CONFIG_KASAN_VMALLOC. * diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 4a4929b29a23..94136f84b449 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -498,7 +498,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, #else /* CONFIG_KASAN_VMALLOC */ -int kasan_module_alloc(void *addr, size_t size) +int kasan_module_alloc(void *addr, size_t size, gfp_t gfp_mask) { void *ret; size_t scaled_size; @@ -520,9 +520,14 @@ int kasan_module_alloc(void *addr, size_t size) __builtin_return_address(0)); if (ret) { + struct vm_struct *vm = find_vm_area(addr); __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; + vm->flags |= VM_KASAN; kmemleak_ignore(ret); + + if (vm->flags & VM_DEFER_KMEMLEAK) + kmemleak_vmalloc(vm, size, gfp_mask); + return 0; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d2a00ad4e1dd..bf3c2fe8f528 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3074,7 +3074,8 @@ again: clear_vm_uninitialized_flag(area); size = PAGE_ALIGN(size); - kmemleak_vmalloc(area, size, gfp_mask); + if (!(vm_flags & VM_DEFER_KMEMLEAK)) + kmemleak_vmalloc(area, size, gfp_mask); return addr; -- cgit v1.2.3 From 5b24eeef06701cca6852f1bf768248ccc912819b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:15 -0800 Subject: mm/page_alloc: split prep_compound_page into head and tail subparts Patch series "mm, device-dax: Introduce compound pages in devmap", v7. This series converts device-dax to use compound pages, and moves away from the 'struct page per basepage on PMD/PUD' that is done today. Doing so 1) unlocks a few noticeable improvements on unpin_user_pages() and makes device-dax+altmap case 4x times faster in pinning (numbers below and in last patch) 2) as mentioned in various other threads it's one important step towards cleaning up ZONE_DEVICE refcounting. I've split the compound pages on devmap part from the rest based on recent discussions on devmap pending and future work planned[5][6]. There is consensus that device-dax should be using compound pages to represent its PMD/PUDs just like HugeTLB and THP, and that leads to less specialization of the dax parts. I will pursue the rest of the work in parallel once this part is merged, particular the GUP-{slow,fast} improvements [7] and the tail struct page deduplication memory savings part[8]. To summarize what the series does: Patch 1: Prepare hwpoisoning to work with dax compound pages. Patches 2-3: Split the current utility function of prep_compound_page() into head and tail and use those two helpers where appropriate to take advantage of caches being warm after __init_single_page(). This is used when initializing zone device when we bring up device-dax namespaces. Patches 4-10: Add devmap support for compound pages in device-dax. memmap_init_zone_device() initialize its metadata as compound pages, and it introduces a new devmap property known as vmemmap_shift which outlines how the vmemmap is structured (defaults to base pages as done today). The property describe the page order of the metadata essentially. While at it do a few cleanups in device-dax in patches 5-9. Finally enable device-dax usage of devmap @vmemmap_shift to a value based on its own @align property. @vmemmap_shift returns 0 by default (which is today's case of base pages in devmap, like fsdax or the others) and the usage of compound devmap is optional. Starting with device-dax (*not* fsdax) we enable it by default. There are a few pinning improvements particular on the unpinning case and altmap, as well as unpin_user_page_range_dirty_lock() being just as effective as THP/hugetlb[0] pages. $ gup_test -f /dev/dax1.0 -m 16384 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~71 ms -> put:~22 ms [altmap] (pin_user_pages_fast 2M pages) get:~524ms put:~525 ms -> get: ~127ms put:~71ms $ gup_test -f /dev/dax1.0 -m 129022 -r 10 -S -a -n 512 -w (pin_user_pages_fast 2M pages) put:~513 ms -> put:~188 ms [altmap with -m 127004] (pin_user_pages_fast 2M pages) get:~4.1 secs put:~4.12 secs -> get:~1sec put:~563ms Tested on x86 with 1Tb+ of pmem (alongside registering it with RDMA with and without altmap), alongside gup_test selftests with dynamic dax regions and static dax regions. Coupled with ndctl unit tests for dynamic dax devices that exercise all of this. Note, for dynamic dax regions I had to revert commit 8aa83e6395 ("x86/setup: Call early_reserve_memory() earlier"), it is a known issue that this commit broke efi_fake_mem=. This patch (of 11): Split the utility function prep_compound_page() into head and tail counterparts, and use them accordingly. This is in preparation for sharing the storage for compound page metadata. Link: https://lkml.kernel.org/r/20211202204422.26777-1-joao.m.martins@oracle.com Link: https://lkml.kernel.org/r/20211202204422.26777-3-joao.m.martins@oracle.com Signed-off-by: Joao Martins Acked-by: Mike Kravetz Reviewed-by: Dan Williams Reviewed-by: Muchun Song Cc: Vishal Verma Cc: Dave Jiang Cc: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jane Chu Cc: Jonathan Corbet Cc: Christoph Hellwig Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..20b9db0cf97c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -726,23 +726,33 @@ void free_compound_page(struct page *page) free_the_page(page, compound_order(page)); } +static void prep_compound_head(struct page *page, unsigned int order) +{ + set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + set_compound_order(page, order); + atomic_set(compound_mapcount_ptr(page), -1); + if (hpage_pincount_available(page)) + atomic_set(compound_pincount_ptr(page), 0); +} + +static void prep_compound_tail(struct page *head, int tail_idx) +{ + struct page *p = head + tail_idx; + + p->mapping = TAIL_MAPPING; + set_compound_head(p, head); +} + void prep_compound_page(struct page *page, unsigned int order) { int i; int nr_pages = 1 << order; __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - p->mapping = TAIL_MAPPING; - set_compound_head(p, page); - } + for (i = 1; i < nr_pages; i++) + prep_compound_tail(page, i); - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); - set_compound_order(page, order); - atomic_set(compound_mapcount_ptr(page), -1); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + prep_compound_head(page, order); } #ifdef CONFIG_DEBUG_PAGEALLOC -- cgit v1.2.3 From 46487e0095f895c25da9feae27dc06d2aa76793d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:18 -0800 Subject: mm/page_alloc: refactor memmap_init_zone_device() page init Move struct page init to an helper function __init_zone_device_page(). This is in preparation for sharing the storage for compound page metadata. Link: https://lkml.kernel.org/r/20211202204422.26777-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 74 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20b9db0cf97c..23045a2a1339 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6572,6 +6572,46 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone } #ifdef CONFIG_ZONE_DEVICE +static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap) +{ + + __init_single_page(page, pfn, zone_idx, nid); + + /* + * Mark page reserved as it will need to wait for onlining + * phase for it to be fully associated with a zone. + * + * We can use the non-atomic __set_bit operation for setting + * the flag as we are still initializing the pages. + */ + __SetPageReserved(page); + + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. + */ + page->pgmap = pgmap; + page->zone_device_data = NULL; + + /* + * Mark the block movable so that blocks are reserved for + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived + * kernel allocations are made. + * + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap + * because this is done early in section_activate() + */ + if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + cond_resched(); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6600,39 +6640,7 @@ void __ref memmap_init_zone_device(struct zone *zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone_idx, nid); - - /* - * Mark page reserved as it will need to wait for onlining - * phase for it to be fully associated with a zone. - * - * We can use the non-atomic __set_bit operation for setting - * the flag as we are still initializing the pages. - */ - __SetPageReserved(page); - - /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer - * and zone_device_data. It is a bug if a ZONE_DEVICE page is - * ever freed or placed on a driver-private list. - */ - page->pgmap = pgmap; - page->zone_device_data = NULL; - - /* - * Mark the block movable so that blocks are reserved for - * movable at startup. This will force kernel allocations - * to reserve their blocks rather than leaking throughout - * the address space during boot when many long-lived - * kernel allocations are made. - * - * Please note that MEMINIT_HOTPLUG path doesn't clear memmap - * because this is done early in section_activate() - */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - cond_resched(); - } + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); } pr_info("%s initialised %lu pages in %ums\n", __func__, -- cgit v1.2.3 From c4386bd8ee3a921c3c799b7197dc898ade76a453 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 14 Jan 2022 14:04:22 -0800 Subject: mm/memremap: add ZONE_DEVICE support for compound pages Add a new @vmemmap_shift property for struct dev_pagemap which specifies that a devmap is composed of a set of compound pages of order @vmemmap_shift, instead of base pages. When a compound page devmap is requested, all but the first page are initialised as tail pages instead of order-0 pages. For certain ZONE_DEVICE users like device-dax which have a fixed page size, this creates an opportunity to optimize GUP and GUP-fast walkers, treating it the same way as THP or hugetlb pages. Additionally, commit 7118fc2906e2 ("hugetlb: address ref count racing in prep_compound_gigantic_page") removed set_page_count() because the setting of page ref count to zero was redundant. devmap pages don't come from page allocator though and only head page refcount is used for compound pages, hence initialize tail page count to zero. Link: https://lkml.kernel.org/r/20211202204422.26777-5-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Dan Williams Cc: Christoph Hellwig Cc: Dave Jiang Cc: Jane Chu Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: John Hubbard Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Naoya Horiguchi Cc: Vishal Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memremap.h | 11 +++++++++++ mm/memremap.c | 18 ++++++++++++------ mm/page_alloc.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 60 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c0e9d35889e8..61a6a0e27359 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -99,6 +99,11 @@ struct dev_pagemap_ops { * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior + * @vmemmap_shift: structural definition of how the vmemmap page metadata + * is populated, specifically the metadata page order. + * A zero value (default) uses base pages as the vmemmap metadata + * representation. A bigger value will set up compound struct pages + * of the requested order value. * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no @@ -114,6 +119,7 @@ struct dev_pagemap { struct completion done; enum memory_type type; unsigned int flags; + unsigned long vmemmap_shift; const struct dev_pagemap_ops *ops; void *owner; int nr_range; @@ -130,6 +136,11 @@ static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) return NULL; } +static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap) +{ + return 1 << pgmap->vmemmap_shift; +} + #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); diff --git a/mm/memremap.c b/mm/memremap.c index 5a66a71ab591..a2869d8519a2 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -102,15 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id) return (range->start + range_len(range)) >> PAGE_SHIFT; } -static unsigned long pfn_next(unsigned long pfn) +static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn) { - if (pfn % 1024 == 0) + if (pfn % (1024 << pgmap->vmemmap_shift)) cond_resched(); - return pfn + 1; + return pfn + pgmap_vmemmap_nr(pgmap); +} + +static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id) +{ + return (pfn_end(pgmap, range_id) - + pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift; } #define for_each_device_pfn(pfn, map, i) \ - for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn)) + for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \ + pfn = pfn_next(map, pfn)) static void dev_pagemap_kill(struct dev_pagemap *pgmap) { @@ -295,8 +302,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id) - - pfn_first(pgmap, range_id)); + percpu_ref_get_many(pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23045a2a1339..d59023a676ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6612,6 +6612,35 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, } } +static void __ref memmap_init_compound(struct page *head, + unsigned long head_pfn, + unsigned long zone_idx, int nid, + struct dev_pagemap *pgmap, + unsigned long nr_pages) +{ + unsigned long pfn, end_pfn = head_pfn + nr_pages; + unsigned int order = pgmap->vmemmap_shift; + + __SetPageHead(head); + for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + prep_compound_tail(head, pfn - head_pfn); + set_page_count(page, 0); + + /* + * The first tail page stores compound_mapcount_ptr() and + * compound_order() and the second tail page stores + * compound_pincount_ptr(). Call prep_compound_head() after + * the first and second tail pages have been initialized to + * not have the data overwritten. + */ + if (pfn == head_pfn + 2) + prep_compound_head(head, order); + } +} + void __ref memmap_init_zone_device(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, @@ -6620,6 +6649,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long pfn, end_pfn = start_pfn + nr_pages; struct pglist_data *pgdat = zone->zone_pgdat; struct vmem_altmap *altmap = pgmap_altmap(pgmap); + unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; @@ -6637,10 +6667,16 @@ void __ref memmap_init_zone_device(struct zone *zone, nr_pages = end_pfn - start_pfn; } - for (pfn = start_pfn; pfn < end_pfn; pfn++) { + for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) { struct page *page = pfn_to_page(pfn); __init_zone_device_page(page, pfn, zone_idx, nid, pgmap); + + if (pfns_per_compound == 1) + continue; + + memmap_init_compound(page, pfn, zone_idx, nid, pgmap, + pfns_per_compound); } pr_info("%s initialised %lu pages in %ums\n", __func__, -- cgit v1.2.3 From bed0a9b591492bb285ea88cd221e0412031396ca Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 14 Jan 2022 14:04:54 -0800 Subject: kasan: add ability to detect double-kmem_cache_destroy() Because mm/slab_common.c is not instrumented with software KASAN modes, it is not possible to detect use-after-free of the kmem_cache passed into kmem_cache_destroy(). In particular, because of the s->refcount-- and subsequent early return if non-zero, KASAN would never be able to see the double-free via kmem_cache_free(kmem_cache, s). To be able to detect a double-kmem_cache_destroy(), check accessibility of the kmem_cache, and in case of failure return early. While KASAN_HW_TAGS is able to detect such bugs, by checking accessibility and returning early we fail more gracefully and also avoid corrupting reused objects (where tags mismatch). A recent case of a double-kmem_cache_destroy() was detected by KFENCE: https://lkml.kernel.org/r/0000000000003f654905c168b09d@google.com, which was not detectable by software KASAN modes. Link: https://lkml.kernel.org/r/20211119142219.1519617-1-elver@google.com Signed-off-by: Marco Elver Acked-by: Vlastimil Babka Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index b7c431819cdb..f02c32bd05ab 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -489,7 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - if (unlikely(!s)) + if (unlikely(!s) || !kasan_check_byte(s)) return; cpus_read_lock(); -- cgit v1.2.3 From 26dca996ea7b1ac7008b6b6063fc88b849e3ac3e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 14 Jan 2022 14:05:01 -0800 Subject: kasan: fix quarantine conflicting with init_on_free KASAN's quarantine might save its metadata inside freed objects. As this happens after the memory is zeroed by the slab allocator when init_on_free is enabled, the memory coming out of quarantine is not properly zeroed. This causes lib/test_meminit.c tests to fail with Generic KASAN. Zero the metadata when the object is removed from quarantine. Link: https://lkml.kernel.org/r/2805da5df4b57138fdacd671f5d227d58950ba54.1640037083.git.andreyknvl@google.com Fixes: 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index d8ccff4c1275..47ed4fc33a29 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -132,11 +132,22 @@ static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) { void *object = qlink_to_object(qlink, cache); + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); unsigned long flags; if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags); + /* + * If init_on_free is enabled and KASAN's free metadata is stored in + * the object, zero the metadata. Otherwise, the object's memory will + * not be properly zeroed, as KASAN saves the metadata after the slab + * allocator zeroes the object. + */ + if (slab_want_init_on_free(cache) && + cache->kasan_info.free_meta_offset == 0) + memzero_explicit(meta, sizeof(*meta)); + /* * As the object now gets freed from the quarantine, assume that its * free track is no longer valid. -- cgit v1.2.3 From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:05:04 -0800 Subject: mm,fs: split dump_mapping() out from dump_page() dump_mapping() is a big chunk of dump_page(), and it'd be handy to be able to call it when we don't have a struct page. Split it out and move it to fs/inode.c. Take the opportunity to simplify some of the debug messages a little. Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/debug.c | 52 ++-------------------------------------------------- 3 files changed, 52 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/fs/inode.c b/fs/inode.c index 6b80a51129d5..980e7b7a5460 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); +void dump_mapping(const struct address_space *mapping) +{ + struct inode *host; + const struct address_space_operations *a_ops; + struct hlist_node *dentry_first; + struct dentry *dentry_ptr; + struct dentry dentry; + unsigned long ino; + + /* + * If mapping is an invalid pointer, we don't want to crash + * accessing it, so probe everything depending on it carefully. + */ + if (get_kernel_nofault(host, &mapping->host) || + get_kernel_nofault(a_ops, &mapping->a_ops)) { + pr_warn("invalid mapping:%px\n", mapping); + return; + } + + if (!host) { + pr_warn("aops:%ps\n", a_ops); + return; + } + + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { + pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); + return; + } + + if (!dentry_first) { + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + return; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (get_kernel_nofault(dentry, dentry_ptr)) { + pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + a_ops, ino, dentry_ptr); + return; + } + + /* + * if dentry is corrupted, the %pd handler may still crash, + * but it's unlikely that we reach here with a corrupt mapping + */ + pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); +} + void clear_inode(struct inode *inode) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5315fa68f751 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); +void dump_mapping(const struct address_space *); /* * Userspace may rely on the the inode number being non-zero. For example, glibc diff --git a/mm/debug.c b/mm/debug.c index a05a39ff8fe4..bc9ac87f0e08 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -112,56 +112,8 @@ static void __dump_page(struct page *page) type = "ksm "; else if (PageAnon(page)) type = "anon "; - else if (mapping) { - struct inode *host; - const struct address_space_operations *a_ops; - struct hlist_node *dentry_first; - struct dentry *dentry_ptr; - struct dentry dentry; - unsigned long ino; - - /* - * mapping can be invalid pointer and we don't want to crash - * accessing it, so probe everything depending on it carefully - */ - if (get_kernel_nofault(host, &mapping->host) || - get_kernel_nofault(a_ops, &mapping->a_ops)) { - pr_warn("failed to read mapping contents, not a valid kernel address?\n"); - goto out_mapping; - } - - if (!host) { - pr_warn("aops:%ps\n", a_ops); - goto out_mapping; - } - - if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || - get_kernel_nofault(ino, &host->i_ino)) { - pr_warn("aops:%ps with invalid host inode %px\n", - a_ops, host); - goto out_mapping; - } - - if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); - goto out_mapping; - } - - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", - a_ops, ino, dentry_ptr); - } else { - /* - * if dentry is corrupted, the %pd handler may still - * crash, but it's unlikely that we reach here with a - * corrupted struct page - */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, ino, &dentry); - } - } -out_mapping: + else if (mapping) + dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %pGp%s\n", type, &head->flags, -- cgit v1.2.3 From 236476180c0f5d308fb313d5570d0b067307884c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:05:07 -0800 Subject: mm/debug_vm_pgtable: update comments regarding migration swap entries Commit 4dd845b5a3e5 ("mm/swapops: rework swap entry manipulation code") had changed migtation entry related helpers. Just update debug_vm_pgatble() synced documentation to reflect those changes. Link: https://lkml.kernel.org/r/1641880417-24848-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 14 +++++++------- mm/debug_vm_pgtable.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index 552567d863b8..b3166c33db39 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -247,12 +247,12 @@ SWAP Page Table Helpers | __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | +---------------------------+--------------------------------------------------+ | is_migration_entry | Tests a migration (read or write) swapped entry | -+---------------------------+--------------------------------------------------+ -| is_write_migration_entry | Tests a write migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry_read | Converts into read migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry | Creates a migration swapped entry (read or write)| -+---------------------------+--------------------------------------------------+ ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 228e3954b90c..2a2b24e87877 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -888,8 +888,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) pr_debug("Validating swap migration\n"); /* - * make_migration_entry() expects given page to be - * locked, otherwise it stumbles upon a BUG_ON(). + * make_[readable|writable]_migration_entry() expects given page to + * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); swp = make_writable_migration_entry(page_to_pfn(page)); -- cgit v1.2.3 From 43b93121056c524e2af77d561900ea856d32029c Mon Sep 17 00:00:00 2001 From: chiminghao Date: Fri, 14 Jan 2022 14:05:10 -0800 Subject: mm/truncate.c: remove unneeded variable Return value directly instead of taking this in another redundant variable. Link: https://lkml.kernel.org/r/20211207083222.401594-1-chi.minghao@zte.com.cn Signed-off-by: chiminghao Reported-by: Zeal Robot Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index cc83a3f7c1ad..41b8249b3b4a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -205,7 +205,6 @@ static void truncate_cleanup_page(struct page *page) static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret; if (page->mapping != mapping) return 0; @@ -213,9 +212,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); } int truncate_inode_page(struct address_space *mapping, struct page *page) -- cgit v1.2.3 From 677b2a8c1f25db5b09c1ef5bf72faa39ea81d9cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Jan 2022 14:05:13 -0800 Subject: gup: avoid multiple user access locking/unlocking in fault_in_{read/write}able fault_in_readable() and fault_in_writeable() perform __get_user() and __put_user() in a loop, implying multiple user access locking/unlocking. To avoid that, use user access blocks. Link: https://lkml.kernel.org/r/720dcf79314acca1a78fae56d478cc851952149d.1637084492.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 2c51e9748a6a..be2a41feec7d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1672,21 +1672,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_write_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__put_user(0, uaddr) != 0)) - return size; + unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__put_user(0, uaddr) != 0)) - goto out; + unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: + user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; @@ -1771,21 +1772,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_read_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__get_user(c, uaddr) != 0)) - return size; + unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__get_user(c, uaddr) != 0)) - goto out; + unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: + user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); -- cgit v1.2.3 From 28b0ee3fb35047bd2bac57cc5a051b26bbd9b194 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Fri, 14 Jan 2022 14:05:16 -0800 Subject: mm/gup.c: stricter check on THP migration entry during follow_pmd_mask When BUG_ON check for THP migration entry, the existing code only check thp_migration_supported case, but not for !thp_migration_supported case. If !thp_migration_supported() and !pmd_present(), the original code may dead loop in theory. To make the BUG_ON check consistent, we need catch both cases. Move the BUG_ON check one step earlier, because if the bug happen we should know it instead of depend on FOLL_MIGRATION been used by caller. Because pmdval instead of *pmd is read by the is_pmd_migration_entry() check, the existing code don't help to avoid useless locking within pmd_migration_entry_wait(), so remove that check. Link: https://lkml.kernel.org/r/20211217062559.737063-1-lixinhai.lxh@gmail.com Signed-off-by: Li Xinhai Reviewed-by: "Huang, Ying" Reviewed-by: Miaohe Lin Cc: Zi Yan Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index be2a41feec7d..f0af462ac1e2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -642,12 +642,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } retry: if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. + */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmdval)); - if (is_pmd_migration_entry(pmdval)) - pmd_migration_entry_wait(mm, pmd); + + pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because -- cgit v1.2.3 From a7605426666196c5a460dd3de6f8dac1d3c21f00 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 14 Jan 2022 14:05:19 -0800 Subject: mm: shmem: don't truncate page if memory failure happens The current behavior of memory failure is to truncate the page cache regardless of dirty or clean. If the page is dirty the later access will get the obsolete data from disk without any notification to the users. This may cause silent data loss. It is even worse for shmem since shmem is in-memory filesystem, truncating page cache means discarding data blocks. The later read would return all zero. The right approach is to keep the corrupted page in page cache, any later access would return error for syscalls or SIGBUS for page fault, until the file is truncated, hole punched or removed. The regular storage backed filesystems would be more complicated so this patch is focused on shmem. This also unblock the support for soft offlining shmem THP. [akpm@linux-foundation.org: coding style fixes] [arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()] Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org [Fix invalid pointer dereference in shmem_read_mapping_page_gfp() with a slight different implementation from what Ajay Garg and Muchun Song proposed and reworked the error handling of shmem_write_begin() suggested by Linus] Link: https://lore.kernel.org/linux-mm/20211111084617.6746-1-ajaygargnsit@gmail.com/ Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com Link: https://lkml.kernel.org/r/20211116193247.21102-1-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ajay Garg Cc: Muchun Song Cc: Andy Lavr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 +++++++++++--- mm/shmem.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ mm/userfaultfd.c | 5 +++++ 3 files changed, 61 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3a274468f193..5f8ad5527506 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) goto out; } + /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 18f93c2d68f1..fb5152369926 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2457,6 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2467,7 +2468,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2554,6 +2567,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3093,7 +3112,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3101,6 +3121,13 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3751,6 +3778,13 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3761,7 +3795,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4169,9 +4203,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ac6f036298cd..0780c2a57ff1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret) -- cgit v1.2.3 From 62c9827cbb996c2c04f615ecd783ce28bcea894b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:05:23 -0800 Subject: shmem: fix a race between shmem_unused_huge_shrink and shmem_evict_inode Fix a data race in commit 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure"). Here are call traces causing race: Call Trace 1: shmem_unused_huge_shrink+0x3ae/0x410 ? __list_lru_walk_one.isra.5+0x33/0x160 super_cache_scan+0x17c/0x190 shrink_slab.part.55+0x1ef/0x3f0 shrink_node+0x10e/0x330 kswapd+0x380/0x740 kthread+0xfc/0x130 ? mem_cgroup_shrink_node+0x170/0x170 ? kthread_create_on_node+0x70/0x70 ret_from_fork+0x1f/0x30 Call Trace 2: shmem_evict_inode+0xd8/0x190 evict+0xbe/0x1c0 do_unlinkat+0x137/0x330 do_syscall_64+0x76/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 A simple explanation: Image there are 3 items in the local list (@list). In the first traversal, A is not deleted from @list. 1) A->B->C ^ | pos (leave) In the second traversal, B is deleted from @list. Concurrently, A is deleted from @list through shmem_evict_inode() since last reference counter of inode is dropped by other thread. Then the @list is corrupted. 2) A->B->C ^ ^ | | evict pos (drop) We should make sure the inode is either on the global list or deleted from any local list before iput(). Fixed by moving inodes back to global list before we put them. [akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211125064502.99983-1-ligang.bdlg@bytedance.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Gang Li Reviewed-by: Muchun Song Acked-by: Kirill A. Shutemov Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index fb5152369926..8f940552c182 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -554,7 +554,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shmem_inode_info *info; struct page *page; unsigned long batch = sc ? sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -569,7 +569,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -577,12 +576,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -602,7 +601,7 @@ next: inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); @@ -616,38 +615,44 @@ next: } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ if (!trylock_page(page)) { put_page(page); - goto leave; + goto move_back; } ret = split_huge_page(page); unlock_page(page); put_page(page); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). + */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } -- cgit v1.2.3 From 3795f46b83c66a2e4545460dec74c80b839faafe Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 14 Jan 2022 14:05:26 -0800 Subject: mm/frontswap.c: use non-atomic '__set_bit()' when possible The 'a' and 'b' bitmaps are local to this function, so no concurrent access can occur. So the non-atomic '__set_bit()' can be used to save a few cycles. Link: https://lkml.kernel.org/r/e52476da5cee57151745c5c3c934a69798dc6fa4.1638132190.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/frontswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/frontswap.c b/mm/frontswap.c index 130e301c5ac0..6bed12260dea 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -127,7 +127,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (!WARN_ON(!si->frontswap_map)) - set_bit(si->type, a); + __set_bit(si->type, a); } spin_unlock(&swap_lock); @@ -149,7 +149,7 @@ void frontswap_register_ops(struct frontswap_ops *ops) spin_lock(&swap_lock); plist_for_each_entry(si, &swap_active_head, list) { if (si->frontswap_map) - set_bit(si->type, b); + __set_bit(si->type, b); } spin_unlock(&swap_lock); -- cgit v1.2.3 From 17c17367758059930246dde937cc7da9b8f3549e Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 14 Jan 2022 14:05:29 -0800 Subject: mm: memcontrol: make cgroup_memory_nokmem static Commit 494c1dfe855e ("mm: memcg/slab: create a new set of kmalloc-cg- caches") makes cgroup_memory_nokmem global, however, it is unnecessary because there is already a function mem_cgroup_kmem_disabled() which exports it. Just make it static and replace it with mem_cgroup_kmem_disabled() in mm/slab_common.c. Link: https://lkml.kernel.org/r/20211109065418.21693-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Chris Down Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 ----- mm/memcontrol.c | 2 +- mm/slab_common.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 3b79a5c9427a..bdac62e1eca7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -157,11 +157,6 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason */ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); -/* - * in mm/memcontrol.c: - */ -extern bool cgroup_memory_nokmem; - /* * in mm/page_alloc.c */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3542237c833f..bfe9bdec192b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -84,7 +84,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP diff --git a/mm/slab_common.c b/mm/slab_common.c index f02c32bd05ab..1f75bd4e95d6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -844,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) if (type == KMALLOC_RECLAIM) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { - if (cgroup_memory_nokmem) { + if (mem_cgroup_kmem_disabled()) { kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx]; return; } -- cgit v1.2.3 From 46a53371f3fd9bf873fdd9c4df75b1cd86df1098 Mon Sep 17 00:00:00 2001 From: Donghai Qiao Date: Fri, 14 Jan 2022 14:05:32 -0800 Subject: mm/page_counter: remove an incorrect call to propagate_protected_usage() propagate_protected_usage() is called to propagate the usage change in the page_counter structure. But there is a call to this function from page_counter_try_charge() when there is actually no usage change. Hence this call should be removed. Link: https://lkml.kernel.org/r/20211118181125.3918222-1-dqiao@redhat.com Signed-off-by: Donghai Qiao Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_counter.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_counter.c b/mm/page_counter.c index 7d83641eb86b..eb156ff5d603 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_protected_usage(c, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt which is only used -- cgit v1.2.3 From b6bf9abb0aa44e53ffe9c1e6e1d32568f5b25e4a Mon Sep 17 00:00:00 2001 From: Dan Schatzberg Date: Fri, 14 Jan 2022 14:05:35 -0800 Subject: mm/memcg: add oom_group_kill memory event Our container agent wants to know when a container exits if it was OOM killed or not to report to the user. We use memory.oom.group = 1 to ensure that OOM kills within the container's cgroup kill everything. Existing memory.events are insufficient for knowing if this triggered: 1) Our current approach reads memory.events oom_kill and reports the container was killed if the value is non-zero. This is erroneous in some cases where containers create their children cgroups with memory.oom.group=1 as such OOM kills will get counted against the container cgroup's oom_kill counter despite not actually OOM killing the entire container. 2) Reading memory.events.local will fail to identify OOM kills in leaf cgroups (that don't set memory.oom.group) within the container cgroup. This patch adds a new oom_group_kill event when memory.oom.group triggers to allow userspace to cleanly identify when an entire cgroup is oom killed. [schatzberg.dan@gmail.com: changes from Johannes and Chris] Link: https://lkml.kernel.org/r/20211213162511.2492267-1-schatzberg.dan@gmail.com Link: https://lkml.kernel.org/r/20211203162426.3375036-1-schatzberg.dan@gmail.com Signed-off-by: Dan Schatzberg Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Acked-by: Chris Down Reviewed-by: Shakeel Butt Acked-by: Michal Hocko Cc: Tejun Heo Cc: Zefan Li Cc: Jonathan Corbet Cc: Vladimir Davydov Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Alex Shi Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 2 ++ mm/oom_kill.c | 1 + 4 files changed, 7 insertions(+) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 2aeb7ae8b393..8269bfa240f4 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1268,6 +1268,9 @@ PAGE_SIZE multiple when read back. The number of processes belonging to this cgroup killed by any kind of OOM killer. + oom_group_kill + The number of times a group OOM has occurred. + memory.events.local Similar to memory.events but the fields in the file are local to the cgroup i.e. not hierarchical. The file modified event diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0c5c403f4be6..951f24f42147 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -42,6 +42,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_OOM_GROUP_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bfe9bdec192b..2d39d58baccf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6318,6 +6318,8 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_KILL])); + seq_printf(m, "oom_group_kill %lu\n", + atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); } static int memory_events_show(struct seq_file *m, void *v) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1ddabefcfb5a..e52ce0b1465d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -994,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * If necessary, kill all tasks in the selected memory cgroup. */ if (oom_group) { + memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL); mem_cgroup_print_oom_group(oom_group); mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, (void *)message); -- cgit v1.2.3 From 5b3be698a872c490dbed524f3e2463701ab21339 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:39 -0800 Subject: memcg: better bounds on the memcg stats updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 11192d9c124d ("memcg: flush stats only if updated") added tracking of memcg stats updates which is used by the readers to flush only if the updates are over a certain threshold. However each individual update can correspond to a large value change for a given stat. For example adding or removing a hugepage to an LRU changes the stat by thp_nr_pages (512 on x86_64). Treating the update related to THP as one can keep the stat off, in theory, by (thp_nr_pages * nr_cpus * CHARGE_BATCH) before flush. To handle such scenarios, this patch adds consideration of the stat update value as well instead of just the update event. In addition let the asyn flusher unconditionally flush the stats to put time limit on the stats skew and hopefully a lot less readers would need to flush. Link: https://lkml.kernel.org/r/20211118065350.697046-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: "Michal Koutný" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d39d58baccf..aa2a15298636 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -629,11 +629,17 @@ static DEFINE_SPINLOCK(stats_flush_lock); static DEFINE_PER_CPU(unsigned int, stats_updates); static atomic_t stats_flush_threshold = ATOMIC_INIT(0); -static inline void memcg_rstat_updated(struct mem_cgroup *memcg) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + unsigned int x; + cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id()); - if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH)) - atomic_inc(&stats_flush_threshold); + + x = __this_cpu_add_return(stats_updates, abs(val)); + if (x > MEMCG_CHARGE_BATCH) { + atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold); + __this_cpu_write(stats_updates, 0); + } } static void __mem_cgroup_flush_stats(void) @@ -656,7 +662,7 @@ void mem_cgroup_flush_stats(void) static void flush_memcg_stats_dwork(struct work_struct *w) { - mem_cgroup_flush_stats(); + __mem_cgroup_flush_stats(); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); } @@ -672,7 +678,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -705,7 +711,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, val); } /** @@ -789,7 +795,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, return; __this_cpu_add(memcg->vmstats_percpu->events[idx], count); - memcg_rstat_updated(memcg); + memcg_rstat_updated(memcg, count); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) -- cgit v1.2.3 From 06b2c3b08ce134c9555d91a1cf15cd03646cc287 Mon Sep 17 00:00:00 2001 From: Wang Weiyang Date: Fri, 14 Jan 2022 14:05:42 -0800 Subject: mm/memcg: use struct_size() helper in kzalloc() Make use of the struct_size() helper instead of an open-coded version, in order to avoid any potential type mistakes or integer overflows that, in the worst scenario, could lead to heap overflows. Link: https://github.com/KSPP/linux/issues/160 Link: https://lkml.kernel.org/r/20211216022024.127375-1-wangweiyang2@huawei.com Signed-off-by: Wang Weiyang Reviewed-by: Muchun Song Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index aa2a15298636..88e1be912aa7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5122,15 +5122,11 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *memcg; - unsigned int size; int node; int __maybe_unused i; long error = -ENOMEM; - size = sizeof(struct mem_cgroup); - size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); + memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL); if (!memcg) return ERR_PTR(error); -- cgit v1.2.3 From 4e5aa1f4c2b489bc6f3ab5ca54747b18a847289d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 14 Jan 2022 14:05:45 -0800 Subject: memcg: add per-memcg vmalloc stat The kvmalloc* allocation functions can fallback to vmalloc allocations and more often on long running machines. In addition the kernel does have __GFP_ACCOUNT kvmalloc* calls. So, often on long running machines, the memory.stat does not tell the complete picture which type of memory is charged to the memcg. So add a per-memcg vmalloc stat. [shakeelb@google.com: page_memcg() within rcu lock, per Muchun] Link: https://lkml.kernel.org/r/20211222052457.1960701-1-shakeelb@google.com [akpm@linux-foundation.org: remove cast, per Muchun] [shakeelb@google.com: remove area->page[0] checks and move to page by page accounting per Michal] Link: https://lkml.kernel.org/r/20220104222341.3972772-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20211221215336.1922823-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Muchun Song Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 3 +++ include/linux/memcontrol.h | 21 +++++++++++++++++++++ mm/memcontrol.c | 1 + mm/vmalloc.c | 13 +++++++++++-- 4 files changed, 36 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 8269bfa240f4..4f400b03dddf 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1314,6 +1314,9 @@ PAGE_SIZE multiple when read back. sock (npn) Amount of memory used in network transmission buffers + vmalloc (npn) + Amount of memory used for vmap backed memory. + shmem Amount of cached filesystem data that is swap-backed, such as tmpfs, shm segments, shared anonymous mmap()s diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 951f24f42147..0131e5574c88 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -33,6 +33,7 @@ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, + MEMCG_VMALLOC, MEMCG_NR_STAT, }; @@ -992,6 +993,21 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, local_irq_restore(flags); } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + rcu_read_lock(); + memcg = page_memcg(page); + if (memcg) + mod_memcg_state(memcg, idx, val); + rcu_read_unlock(); +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return READ_ONCE(memcg->vmstats.state[idx]); @@ -1447,6 +1463,11 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, { } +static inline void mod_memcg_page_state(struct page *page, + int idx, int val) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 88e1be912aa7..c9ddd02dc5de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1375,6 +1375,7 @@ static const struct memory_stat memory_stats[] = { { "pagetables", NR_PAGETABLE }, { "percpu", MEMCG_PERCPU_B }, { "sock", MEMCG_SOCK }, + { "vmalloc", MEMCG_VMALLOC }, { "shmem", NR_SHMEM }, { "file_mapped", NR_FILE_MAPPED }, { "file_dirty", NR_FILE_DIRTY }, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf3c2fe8f528..80c6de4c425f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -2623,12 +2624,13 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { unsigned int page_order = vm_area_page_order(area); - int i; + int i, step = 1U << page_order; - for (i = 0; i < area->nr_pages; i += 1U << page_order) { + for (i = 0; i < area->nr_pages; i += step) { struct page *page = area->pages[i]; BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -step); __free_pages(page, page_order); cond_resched(); } @@ -2955,6 +2957,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); + if (gfp_mask & __GFP_ACCOUNT) { + int i, step = 1U << page_order; + + for (i = 0; i < area->nr_pages; i += step) + mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, + step); + } /* * If not enough pages were obtained to accomplish an -- cgit v1.2.3 From ac1e9acc5acf0b41d54de6a4c45471644f8b97ff Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:55 -0800 Subject: mm: rearrange madvise code to allow for reuse Patch series "mm: rearrange madvise code to allow for reuse", v11. Avoid performance regression of the new anon vma name field refcounting it. I checked the image sizes with allnoconfig builds: unpatched Linus' ToT text data bss dec hex filename 1324759 32 73928 1398719 1557bf vmlinux After the first patch is applied (madvise refactoring) text data bss dec hex filename 1322346 32 73928 1396306 154e52 vmlinux >>> 2413 bytes decrease vs ToT <<< After all patches applied with CONFIG_ANON_VMA_NAME=n text data bss dec hex filename 1322337 32 73928 1396297 154e49 vmlinux >>> 2422 bytes decrease vs ToT <<< After all patches applied with CONFIG_ANON_VMA_NAME=y text data bss dec hex filename 1325228 32 73928 1399188 155994 vmlinux >>> 469 bytes increase vs ToT <<< This patch (of 3): Refactor the madvise syscall to allow for parts of it to be reused by a prctl syscall that affects vmas. Move the code that walks vmas in a virtual address range into a function that takes a function pointer as a parameter. The only caller for now is sys_madvise, which uses it to call madvise_vma_behavior on each vma, but the next patch will add an additional caller. Move handling all vma behaviors inside madvise_behavior, and rename it to madvise_vma_behavior. Move the code that updates the flags on a vma, including splitting or merging the vma as necessary, into a new function called madvise_update_vma. The next patch will add support for updating a new anon_name field as well. Link: https://lkml.kernel.org/r/20211019215511.3771969-1-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Cc: Pekka Enberg Cc: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Oleg Nesterov Cc: "Eric W. Biederman" Cc: Jan Glauber Cc: John Stultz Cc: Rob Landley Cc: Cyrill Gorcunov Cc: Kees Cook Cc: "Serge E. Hallyn" Cc: David Rientjes Cc: Al Viro Cc: Hugh Dickins Cc: Mel Gorman Cc: Shaohua Li Cc: Johannes Weiner Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 338 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 178 insertions(+), 160 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 8c927202bbe6..4b9c5509990c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -63,76 +63,20 @@ static int madvise_need_mmap_write(int behavior) } /* - * We can potentially split a vm area into separate - * areas, each area with its own behavior. + * Update the vm_flags on region of a vma, splitting it or merging it as + * necessary. Must be called with mmap_sem held for writing; */ -static long madvise_behavior(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) +static int madvise_update_vma(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long new_flags) { struct mm_struct *mm = vma->vm_mm; - int error = 0; + int error; pgoff_t pgoff; - unsigned long new_flags = vma->vm_flags; - - switch (behavior) { - case MADV_NORMAL: - new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; - break; - case MADV_SEQUENTIAL: - new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; - break; - case MADV_RANDOM: - new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; - break; - case MADV_DONTFORK: - new_flags |= VM_DONTCOPY; - break; - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTCOPY; - break; - case MADV_WIPEONFORK: - /* MADV_WIPEONFORK is only supported on anonymous memory. */ - if (vma->vm_file || vma->vm_flags & VM_SHARED) { - error = -EINVAL; - goto out; - } - new_flags |= VM_WIPEONFORK; - break; - case MADV_KEEPONFORK: - new_flags &= ~VM_WIPEONFORK; - break; - case MADV_DONTDUMP: - new_flags |= VM_DONTDUMP; - break; - case MADV_DODUMP: - if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) { - error = -EINVAL; - goto out; - } - new_flags &= ~VM_DONTDUMP; - break; - case MADV_MERGEABLE: - case MADV_UNMERGEABLE: - error = ksm_madvise(vma, start, end, behavior, &new_flags); - if (error) - goto out_convert_errno; - break; - case MADV_HUGEPAGE: - case MADV_NOHUGEPAGE: - error = hugepage_madvise(vma, &new_flags, behavior); - if (error) - goto out_convert_errno; - break; - } if (new_flags == vma->vm_flags) { *prev = vma; - goto out; + return 0; } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -147,23 +91,19 @@ static long madvise_behavior(struct vm_area_struct *vma, *prev = vma; if (start != vma->vm_start) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, start, 1); if (error) - goto out_convert_errno; + return error; } if (end != vma->vm_end) { - if (unlikely(mm->map_count >= sysctl_max_map_count)) { - error = -ENOMEM; - goto out; - } + if (unlikely(mm->map_count >= sysctl_max_map_count)) + return -ENOMEM; error = __split_vma(mm, vma, end, 0); if (error) - goto out_convert_errno; + return error; } success: @@ -172,15 +112,7 @@ success: */ vma->vm_flags = new_flags; -out_convert_errno: - /* - * madvise() returns EAGAIN if kernel resources, such as - * slab, are temporarily unavailable. - */ - if (error == -ENOMEM) - error = -EAGAIN; -out: - return error; + return 0; } #ifdef CONFIG_SWAP @@ -930,6 +862,94 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +/* + * Apply an madvise behavior to a region of a vma. madvise_update_vma + * will handle splitting a vm area into separate areas, each area with its own + * behavior. + */ +static int madvise_vma_behavior(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long behavior) +{ + int error; + unsigned long new_flags = vma->vm_flags; + + switch (behavior) { + case MADV_REMOVE: + return madvise_remove(vma, prev, start, end); + case MADV_WILLNEED: + return madvise_willneed(vma, prev, start, end); + case MADV_COLD: + return madvise_cold(vma, prev, start, end); + case MADV_PAGEOUT: + return madvise_pageout(vma, prev, start, end); + case MADV_FREE: + case MADV_DONTNEED: + return madvise_dontneed_free(vma, prev, start, end, behavior); + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + return madvise_populate(vma, prev, start, end, behavior); + case MADV_NORMAL: + new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; + break; + case MADV_SEQUENTIAL: + new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; + break; + case MADV_RANDOM: + new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; + break; + case MADV_DONTFORK: + new_flags |= VM_DONTCOPY; + break; + case MADV_DOFORK: + if (vma->vm_flags & VM_IO) + return -EINVAL; + new_flags &= ~VM_DONTCOPY; + break; + case MADV_WIPEONFORK: + /* MADV_WIPEONFORK is only supported on anonymous memory. */ + if (vma->vm_file || vma->vm_flags & VM_SHARED) + return -EINVAL; + new_flags |= VM_WIPEONFORK; + break; + case MADV_KEEPONFORK: + new_flags &= ~VM_WIPEONFORK; + break; + case MADV_DONTDUMP: + new_flags |= VM_DONTDUMP; + break; + case MADV_DODUMP: + if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) + return -EINVAL; + new_flags &= ~VM_DONTDUMP; + break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + error = hugepage_madvise(vma, &new_flags, behavior); + if (error) + goto out; + break; + } + + error = madvise_update_vma(vma, prev, start, end, new_flags); + +out: + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + #ifdef CONFIG_MEMORY_FAILURE /* * Error injection support for memory error handling. @@ -978,30 +998,6 @@ static int madvise_inject_error(int behavior, } #endif -static long -madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, int behavior) -{ - switch (behavior) { - case MADV_REMOVE: - return madvise_remove(vma, prev, start, end); - case MADV_WILLNEED: - return madvise_willneed(vma, prev, start, end); - case MADV_COLD: - return madvise_cold(vma, prev, start, end); - case MADV_PAGEOUT: - return madvise_pageout(vma, prev, start, end); - case MADV_FREE: - case MADV_DONTNEED: - return madvise_dontneed_free(vma, prev, start, end, behavior); - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - return madvise_populate(vma, prev, start, end, behavior); - default: - return madvise_behavior(vma, prev, start, end, behavior); - } -} - static bool madvise_behavior_valid(int behavior) { @@ -1055,6 +1051,73 @@ process_madvise_behavior_valid(int behavior) } } +/* + * Walk the vmas in range [start,end), and call the visit function on each one. + * The visit function will get start and end parameters that cover the overlap + * between the current vma and the original range. Any unmapped regions in the + * original range will result in this function returning -ENOMEM while still + * calling the visit function on all of the existing vmas in the range. + * Must be called with the mmap_lock held for reading or writing. + */ +static +int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long arg, + int (*visit)(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, + unsigned long end, unsigned long arg)) +{ + struct vm_area_struct *vma; + struct vm_area_struct *prev; + unsigned long tmp; + int unmapped_error = 0; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - different from the way of handling in mlock etc. + */ + vma = find_vma_prev(mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + int error; + + /* Still start < end. */ + if (!vma) + return -ENOMEM; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + break; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = visit(vma, &prev, start, tmp, arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + if (start >= end) + break; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_lock */ + vma = find_vma(mm, start); + } + + return unmapped_error; +} + /* * The madvise(2) system call. * @@ -1127,10 +1190,8 @@ process_madvise_behavior_valid(int behavior) */ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) { - unsigned long end, tmp; - struct vm_area_struct *vma, *prev; - int unmapped_error = 0; - int error = -EINVAL; + unsigned long end; + int error; int write; size_t len; struct blk_plug plug; @@ -1138,23 +1199,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh start = untagged_addr(start); if (!madvise_behavior_valid(behavior)) - return error; + return -EINVAL; if (!PAGE_ALIGNED(start)) - return error; + return -EINVAL; len = PAGE_ALIGN(len_in); /* Check to see whether len was rounded up from small -ve to zero */ if (len_in && !len) - return error; + return -EINVAL; end = start + len; if (end < start) - return error; + return -EINVAL; - error = 0; if (end == start) - return error; + return 0; #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) @@ -1169,51 +1229,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh mmap_read_lock(mm); } - /* - * If the interval [start,end) covers some unmapped address - * ranges, just ignore them, but return -ENOMEM at the end. - * - different from the way of handling in mlock etc. - */ - vma = find_vma_prev(mm, start, &prev); - if (vma && start > vma->vm_start) - prev = vma; - blk_start_plug(&plug); - for (;;) { - /* Still start < end. */ - error = -ENOMEM; - if (!vma) - goto out; - - /* Here start < (end|vma->vm_end). */ - if (start < vma->vm_start) { - unmapped_error = -ENOMEM; - start = vma->vm_start; - if (start >= end) - goto out; - } - - /* Here vma->vm_start <= start < (end|vma->vm_end) */ - tmp = vma->vm_end; - if (end < tmp) - tmp = end; - - /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ - error = madvise_vma(vma, &prev, start, tmp, behavior); - if (error) - goto out; - start = tmp; - if (prev && start < prev->vm_end) - start = prev->vm_end; - error = unmapped_error; - if (start >= end) - goto out; - if (prev) - vma = prev->vm_next; - else /* madvise_remove dropped mmap_lock */ - vma = find_vma(mm, start); - } -out: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); blk_finish_plug(&plug); if (write) mmap_write_unlock(mm); -- cgit v1.2.3 From 9a10064f5625d5572c3626c1516e0bebc6c9fe9b Mon Sep 17 00:00:00 2001 From: Colin Cross Date: Fri, 14 Jan 2022 14:05:59 -0800 Subject: mm: add a field to store names for private anonymous memory In many userspace applications, and especially in VM based applications like Android uses heavily, there are multiple different allocators in use. At a minimum there is libc malloc and the stack, and in many cases there are libc malloc, the stack, direct syscalls to mmap anonymous memory, and multiple VM heaps (one for small objects, one for big objects, etc.). Each of these layers usually has its own tools to inspect its usage; malloc by compiling a debug version, the VM through heap inspection tools, and for direct syscalls there is usually no way to track them. On Android we heavily use a set of tools that use an extended version of the logic covered in Documentation/vm/pagemap.txt to walk all pages mapped in userspace and slice their usage by process, shared (COW) vs. unique mappings, backing, etc. This can account for real physical memory usage even in cases like fork without exec (which Android uses heavily to share as many private COW pages as possible between processes), Kernel SamePage Merging, and clean zero pages. It produces a measurement of the pages that only exist in that process (USS, for unique), and a measurement of the physical memory usage of that process with the cost of shared pages being evenly split between processes that share them (PSS). If all anonymous memory is indistinguishable then figuring out the real physical memory usage (PSS) of each heap requires either a pagemap walking tool that can understand the heap debugging of every layer, or for every layer's heap debugging tools to implement the pagemap walking logic, in which case it is hard to get a consistent view of memory across the whole system. Tracking the information in userspace leads to all sorts of problems. It either needs to be stored inside the process, which means every process has to have an API to export its current heap information upon request, or it has to be stored externally in a filesystem that somebody needs to clean up on crashes. It needs to be readable while the process is still running, so it has to have some sort of synchronization with every layer of userspace. Efficiently tracking the ranges requires reimplementing something like the kernel vma trees, and linking to it from every layer of userspace. It requires more memory, more syscalls, more runtime cost, and more complexity to separately track regions that the kernel is already tracking. This patch adds a field to /proc/pid/maps and /proc/pid/smaps to show a userspace-provided name for anonymous vmas. The names of named anonymous vmas are shown in /proc/pid/maps and /proc/pid/smaps as [anon:]. Userspace can set the name for a region of memory by calling prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, start, len, (unsigned long)name) Setting the name to NULL clears it. The name length limit is 80 bytes including NUL-terminator and is checked to contain only printable ascii characters (including space), except '[',']','\','$' and '`'. Ascii strings are being used to have a descriptive identifiers for vmas, which can be understood by the users reading /proc/pid/maps or /proc/pid/smaps. Names can be standardized for a given system and they can include some variable parts such as the name of the allocator or a library, tid of the thread using it, etc. The name is stored in a pointer in the shared union in vm_area_struct that points to a null terminated string. Anonymous vmas with the same name (equivalent strings) and are otherwise mergeable will be merged. The name pointers are not shared between vmas even if they contain the same name. The name pointer is stored in a union with fields that are only used on file-backed mappings, so it does not increase memory usage. CONFIG_ANON_VMA_NAME kernel configuration is introduced to enable this feature. It keeps the feature disabled by default to prevent any additional memory overhead and to avoid confusing procfs parsers on systems which are not ready to support named anonymous vmas. The patch is based on the original patch developed by Colin Cross, more specifically on its latest version [1] posted upstream by Sumit Semwal. It used a userspace pointer to store vma names. In that design, name pointers could be shared between vmas. However during the last upstreaming attempt, Kees Cook raised concerns [2] about this approach and suggested to copy the name into kernel memory space, perform validity checks [3] and store as a string referenced from vm_area_struct. One big concern is about fork() performance which would need to strdup anonymous vma names. Dave Hansen suggested experimenting with worst-case scenario of forking a process with 64k vmas having longest possible names [4]. I ran this experiment on an ARM64 Android device and recorded a worst-case regression of almost 40% when forking such a process. This regression is addressed in the followup patch which replaces the pointer to a name with a refcounted structure that allows sharing the name pointer between vmas of the same name. Instead of duplicating the string during fork() or when splitting a vma it increments the refcount. [1] https://lore.kernel.org/linux-mm/20200901161459.11772-4-sumit.semwal@linaro.org/ [2] https://lore.kernel.org/linux-mm/202009031031.D32EF57ED@keescook/ [3] https://lore.kernel.org/linux-mm/202009031022.3834F692@keescook/ [4] https://lore.kernel.org/linux-mm/5d0358ab-8c47-2f5f-8e43-23b89d6a8e95@intel.com/ Changes for prctl(2) manual page (in the options section): PR_SET_VMA Sets an attribute specified in arg2 for virtual memory areas starting from the address specified in arg3 and spanning the size specified in arg4. arg5 specifies the value of the attribute to be set. Note that assigning an attribute to a virtual memory area might prevent it from being merged with adjacent virtual memory areas due to the difference in that attribute's value. Currently, arg2 must be one of: PR_SET_VMA_ANON_NAME Set a name for anonymous virtual memory areas. arg5 should be a pointer to a null-terminated string containing the name. The name length including null byte cannot exceed 80 bytes. If arg5 is NULL, the name of the appropriate anonymous virtual memory areas will be reset. The name can contain only printable ascii characters (including space), except '[',']','\','$' and '`'. This feature is available only if the kernel is built with the CONFIG_ANON_VMA_NAME option enabled. [surenb@google.com: docs: proc.rst: /proc/PID/maps: fix malformed table] Link: https://lkml.kernel.org/r/20211123185928.2513763-1-surenb@google.com [surenb: rebased over v5.15-rc6, replaced userpointer with a kernel copy, added input sanitization and CONFIG_ANON_VMA_NAME config. The bulk of the work here was done by Colin Cross, therefore, with his permission, keeping him as the author] Link: https://lkml.kernel.org/r/20211019215511.3771969-2-surenb@google.com Signed-off-by: Colin Cross Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Stephen Rothwell Cc: Al Viro Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 6 +- fs/proc/task_mmu.c | 12 +++- fs/userfaultfd.c | 7 +- include/linux/mm.h | 13 +++- include/linux/mm_types.h | 64 ++++++++++++++++-- include/uapi/linux/prctl.h | 3 + kernel/fork.c | 2 + kernel/sys.c | 63 ++++++++++++++++++ mm/Kconfig | 14 ++++ mm/madvise.c | 129 +++++++++++++++++++++++++++++++++++-- mm/mempolicy.c | 3 +- mm/mlock.c | 2 +- mm/mmap.c | 38 ++++++----- mm/mprotect.c | 2 +- 14 files changed, 324 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8d7f141c6fc7..061744c436d9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..e6998652fd67 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -308,6 +308,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +321,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..5b2af7b82776 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -877,7 +877,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1436,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1614,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index a7e4a9e7d807..7000442984b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2658,7 +2658,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3391,5 +3391,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3a6e6209600..799e2ee626b2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -426,11 +426,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + char *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -875,4 +883,52 @@ typedef struct { unsigned long val; } swp_entry_t; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 3244cc56b697..4cf20b5f2da3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,12 +365,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504..2450a9f33cb0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2261,6 +2261,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2530,6 +2590,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/madvise.c b/mm/madvise.c index 4b9c5509990c..413bbc6e40a0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -62,19 +63,84 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return; + + kfree(vma->anon_name); + vma->anon_name = NULL; +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + if (vma->anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, vma->anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = kstrdup(name, GFP_KERNEL); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags) + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; return 0; } @@ -82,7 +148,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -111,6 +177,11 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } return 0; } @@ -938,7 +1009,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } - error = madvise_update_vma(vma, prev, start, end, new_flags); + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); out: /* @@ -1118,6 +1190,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, return unmapped_error; } +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..679f47b3a079 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..85edb0011453 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1029,7 +1029,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1048,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1082,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1104,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1118,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1165,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1196,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1205,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1228,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1760,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1809,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -3056,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3249,7 +3255,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); -- cgit v1.2.3 From 78db3412833dc9c479cd17412035f216cfd01a29 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:03 -0800 Subject: mm: add anonymous vma name refcounting While forking a process with high number (64K) of named anonymous vmas the overhead caused by strdup() is noticeable. Experiments with ARM64 Android device show up to 40% performance regression when forking a process with 64k unpopulated anonymous vmas using the max name lengths vs the same process with the same number of anonymous vmas having no name. Introduce anon_vma_name refcounted structure to avoid the overhead of copying vma names during fork() and when splitting named anonymous vmas. When a vma is duplicated, instead of copying the name we increment the refcount of this structure. Multiple vmas can point to the same anon_vma_name as long as they increment the refcount. The name member of anon_vma_name structure is assigned at structure allocation time and is never changed. If vma name changes then the refcount of the original structure is dropped, a new anon_vma_name structure is allocated to hold the new name and the vma pointer is updated to point to the new structure. With this approach the fork() performance regressions is reduced 3-4x times and with usecases using more reasonable number of VMAs (a few thousand) the regressions is not measurable. Link: https://lkml.kernel.org/r/20211019215511.3771969-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Kees Cook Cc: Al Viro Cc: Colin Cross Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: David Rientjes Cc: "Eric W. Biederman" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jan Glauber Cc: Johannes Weiner Cc: John Stultz Cc: Mel Gorman Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Rob Landley Cc: "Serge E. Hallyn" Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 9 ++++++++- mm/madvise.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 799e2ee626b2..449b6eafc695 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -437,7 +444,7 @@ struct vm_area_struct { unsigned long rb_subtree_last; } shared; /* Serialized by mmap_sem. */ - char *anon_name; + struct anon_vma_name *anon_name; }; /* diff --git a/mm/madvise.c b/mm/madvise.c index 413bbc6e40a0..c63aacbbfa78 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -64,6 +64,29 @@ static int madvise_need_mmap_write(int behavior) } #ifdef CONFIG_ANON_VMA_NAME +static struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +static void vma_anon_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + static inline bool has_vma_anon_name(struct vm_area_struct *vma) { return !vma->vm_file && vma->anon_name; @@ -76,7 +99,7 @@ const char *vma_anon_name(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); - return vma->anon_name; + return vma->anon_name->name; } void dup_vma_anon_name(struct vm_area_struct *orig_vma, @@ -85,34 +108,41 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma, if (!has_vma_anon_name(orig_vma)) return; - new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); + kref_get(&orig_vma->anon_name->kref); + new_vma->anon_name = orig_vma->anon_name; } void free_vma_anon_name(struct vm_area_struct *vma) { + struct anon_vma_name *anon_name; + if (!has_vma_anon_name(vma)) return; - kfree(vma->anon_name); + anon_name = vma->anon_name; vma->anon_name = NULL; + kref_put(&anon_name->kref, vma_anon_name_free); } /* mmap_lock should be write-locked */ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) { + const char *anon_name; + if (!name) { free_vma_anon_name(vma); return 0; } - if (vma->anon_name) { + anon_name = vma_anon_name(vma); + if (anon_name) { /* Same name, nothing to do here */ - if (!strcmp(name, vma->anon_name)) + if (!strcmp(name, anon_name)) return 0; free_vma_anon_name(vma); } - vma->anon_name = kstrdup(name, GFP_KERNEL); + vma->anon_name = anon_vma_name_alloc(name); if (!vma->anon_name) return -ENOMEM; -- cgit v1.2.3 From 17fca131cee21724ee953a17c185c14e9533af5b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:07 -0800 Subject: mm: move anon_vma declarations to linux/mm_inline.h The patch to add anonymous vma names causes a build failure in some configurations: include/linux/mm_types.h: In function 'is_same_vma_anon_name': include/linux/mm_types.h:924:37: error: implicit declaration of function 'strcmp' [-Werror=implicit-function-declaration] 924 | return name && vma_name && !strcmp(name, vma_name); | ^~~~~~ include/linux/mm_types.h:22:1: note: 'strcmp' is defined in header ''; did you forget to '#include '? This should not really be part of linux/mm_types.h in the first place, as that header is meant to only contain structure defintions and need a minimum set of indirect includes itself. While the header clearly includes more than it should at this point, let's not make it worse by including string.h as well, which would pull in the expensive (compile-speed wise) fortify-string logic. Move the new functions into a separate header that only needs to be included in a couple of locations. Link: https://lkml.kernel.org/r/20211207125710.2503446-1-arnd@kernel.org Fixes: "mm: add a field to store names for private anonymous memory" Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Colin Cross Cc: Eric Biederman Cc: Kees Cook Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 1 + fs/userfaultfd.c | 1 + include/linux/mm_inline.h | 50 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 48 --------------------------------------------- kernel/fork.c | 1 + mm/madvise.c | 1 + mm/mmap.c | 1 + 7 files changed, 55 insertions(+), 48 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e6998652fd67..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5b2af7b82776..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..47d96d2647ca 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +136,53 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 449b6eafc695..4d5fb84eed5e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -890,52 +890,4 @@ typedef struct { unsigned long val; } swp_entry_t; -#ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling vma_anon_name() and while using - * the returned pointer. - */ -extern const char *vma_anon_name(struct vm_area_struct *vma); - -/* - * mmap_lock should be read-locked for orig_vma->vm_mm. - * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be - * isolated. - */ -extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma); - -/* - * mmap_lock should be write-locked or vma should have been isolated under - * write-locked mmap_lock protection. - */ -extern void free_vma_anon_name(struct vm_area_struct *vma); - -/* mmap_lock should be read-locked */ -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - const char *vma_name = vma_anon_name(vma); - - /* either both NULL, or pointers to same string */ - if (vma_name == name) - return true; - - return name && vma_name && !strcmp(name, vma_name); -} -#else /* CONFIG_ANON_VMA_NAME */ -static inline const char *vma_anon_name(struct vm_area_struct *vma) -{ - return NULL; -} -static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, - struct vm_area_struct *new_vma) {} -static inline void free_vma_anon_name(struct vm_area_struct *vma) {} -static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, - const char *name) -{ - return true; -} -#endif /* CONFIG_ANON_VMA_NAME */ - #endif /* _LINUX_MM_TYPES_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 4cf20b5f2da3..75737e566441 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index c63aacbbfa78..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index 85edb0011453..77733b113c40 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 36090def7bad06a6346f86a7cfdbfda2d138cb64 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 14 Jan 2022 14:06:10 -0800 Subject: mm: move tlb_flush_pending inline helpers to mm_inline.h linux/mm_types.h should only define structure definitions, to make it cheap to include elsewhere. The atomic_t helper function definitions are particularly large, so it's better to move the helpers using those into the existing linux/mm_inline.h and only include that where needed. As a follow-up, we may want to go through all the indirect includes in mm_types.h and reduce them as much as possible. Link: https://lkml.kernel.org/r/20211207125710.2503446-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Al Viro Cc: Stephen Rothwell Cc: Suren Baghdasaryan Cc: Colin Cross Cc: Kees Cook Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Yu Zhao Cc: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 2 +- include/linux/mm.h | 45 -------------- include/linux/mm_inline.h | 86 +++++++++++++++++++++++++++ include/linux/mm_types.h | 129 ++++++++++++++--------------------------- mm/ksm.c | 1 + mm/mapping_dirty_helpers.c | 1 + mm/memory.c | 1 + mm/mmu_gather.c | 1 + mm/pgtable-generic.c | 1 + 9 files changed, 137 insertions(+), 130 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 448cd01eb3ec..5196958aa6ac 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -752,7 +752,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true; return false; diff --git a/include/linux/mm.h b/include/linux/mm.h index 7000442984b9..c17e5cfc1e47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp); */ extern pgprot_t protection_map[16]; -/** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ -enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, -}; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 47d96d2647ca..b725839dfe71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include #include #include #include @@ -185,4 +186,89 @@ static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline void init_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); + /* + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. + */ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. + */ + atomic_dec(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. + * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4d5fb84eed5e..6a89f128c990 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -692,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_set(&mm->tlb_flush_pending, 0); -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - struct vm_fault; /** @@ -890,4 +806,49 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/mm/ksm.c b/mm/ksm.c index 0662093237e4..f34476ac0a41 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index ea734f248fce..1b0ab8fcfd8b 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..bc80d4effac9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1b9837419bf9..afb7185ffdc4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf9794..6523fda274e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /* -- cgit v1.2.3 From 64591e8605d6e2fba2ff38e3227645f039b8893f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:14 -0800 Subject: mm: protect free_pgtables with mmap_lock write lock in exit_mmap oom-reaper and process_mrelease system call should protect against races with exit_mmap which can destroy page tables while they walk the VMA tree. oom-reaper protects from that race by setting MMF_OOM_VICTIM and by relying on exit_mmap to set MMF_OOM_SKIP before taking and releasing mmap_write_lock. process_mrelease has to elevate mm->mm_users to prevent such race. Both oom-reaper and process_mrelease hold mmap_read_lock when walking the VMA tree. The locking rules and mechanisms could be simpler if exit_mmap takes mmap_write_lock while executing destructive operations such as free_pgtables. Change exit_mmap to hold the mmap_write_lock when calling unlock_range, free_pgtables and remove_vma. Note also that because oom-reaper checks VM_LOCKED flag, unlock_range() should not be allowed to race with it. Before this patch, remove_vma used to be called with no locks held, however with fput being executed asynchronously and vm_ops->close not being allowed to hold mmap_lock (it is called from __split_vma with mmap_sem held for write), changing that should be fine. In most cases this lock should be uncontended. Previously, Kirill reported ~4% regression caused by a similar change [1]. We reran the same test and although the individual results are quite noisy, the percentiles show lower regression with 1.6% being the worst case [2]. The change allows oom-reaper and process_mrelease to execute safely under mmap_read_lock without worries that exit_mmap might destroy page tables from under them. [1] https://lore.kernel.org/all/20170725141723.ivukwhddk2voyhuc@node.shutemov.name/ [2] https://lore.kernel.org/all/CAJuCfpGC9-c9P40x7oy=jy5SphMcd0o0G_6U1-+JAziGKG6dGA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20211209191325.3069345-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Rik van Riel Cc: Minchan Kim Cc: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Christian Brauner Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Andy Lutomirski Cc: Christian Brauner Cc: Florian Weimer Cc: Jan Engelhardt Cc: Tim Murray Cc: Jason Gunthorpe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 77733b113c40..3f48d0928e6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3149,25 +3149,27 @@ void exit_mmap(struct mm_struct *mm) * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * - * This needs to be done before calling munlock_vma_pages_all(), + * This needs to be done before calling unlock_range(), * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } + mmap_write_lock(mm); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3178,16 +3180,14 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. */ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } -- cgit v1.2.3 From ba535c1caf3ee78aa7719e9e4b07a0dc1d153b9e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 14 Jan 2022 14:06:22 -0800 Subject: mm/oom_kill: allow process_mrelease to run under mmap_lock protection With exit_mmap holding mmap_write_lock during free_pgtables call, process_mrelease does not need to elevate mm->mm_users in order to prevent exit_mmap from destrying pagetables while __oom_reap_task_mm is walking the VMA tree. The change prevents process_mrelease from calling the last mmput, which can lead to waiting for IO completion in exit_aio. Link: https://lkml.kernel.org/r/20211209191325.3069345-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Reviewed-by: Jason Gunthorpe Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Christian Brauner Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Florian Weimer Cc: Jan Engelhardt Cc: Jann Horn Cc: Johannes Weiner Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Tim Murray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e52ce0b1465d..3390316c8a32 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1170,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1189,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; -- cgit v1.2.3 From 1eba86c096e35e3cc83de1ad2c26f2d70470211b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:29 -0800 Subject: mm: change page type prior to adding page table entry Patch series "page table check", v3. Ensure that some memory corruptions are prevented by checking at the time of insertion of entries into user page tables that there is no illegal sharing. We have recently found a problem [1] that existed in kernel since 4.14. The problem was caused by broken page ref count and led to memory leaking from one process into another. The problem was accidentally detected by studying a dump of one process and noticing that one page contains memory that should not belong to this process. There are some other page->_refcount related problems that were recently fixed: [2], [3] which potentially could also lead to illegal sharing. In addition to hardening refcount [4] itself, this work is an attempt to prevent this class of memory corruption issues. It uses a simple state machine that is independent from regular MM logic to check for illegal sharing at time pages are inserted and removed from page tables. [1] https://lore.kernel.org/all/xr9335nxwc5y.fsf@gthelen2.svl.corp.google.com [2] https://lore.kernel.org/all/1582661774-30925-2-git-send-email-akaher@vmware.com [3] https://lore.kernel.org/all/20210622021423.154662-3-mike.kravetz@oracle.com [4] https://lore.kernel.org/all/20211221150140.988298-1-pasha.tatashin@soleen.com This patch (of 4): There are a few places where we first update the entry in the user page table, and later change the struct page to indicate that this is anonymous or file page. In most places, however, we first configure the page metadata and then insert entries into the page table. Page table check, will use the information from struct page to verify the type of entry is inserted. Change the order in all places to first update struct page, and later to update page table. This means that we first do calls that may change the type of page (anon or file): page_move_anon_rmap page_add_anon_rmap do_page_add_anon_rmap page_add_new_anon_rmap page_add_file_rmap hugepage_add_anon_rmap hugepage_add_new_anon_rmap And after that do calls that add entries to the page table: set_huge_pte_at set_pte_at Link: https://lkml.kernel.org/r/20211221154650.1047963-1-pasha.tatashin@soleen.com Link: https://lkml.kernel.org/r/20211221154650.1047963-2-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: David Rientjes Cc: Paul Turner Cc: Wei Xu Cc: Greg Thelen Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Will Deacon Cc: Mike Rapoport Cc: Kees Cook Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Masahiro Yamada Cc: Sami Tolvanen Cc: Dave Hansen Cc: Frederic Weisbecker Cc: "H. Peter Anvin" Cc: Aneesh Kumar K.V Cc: Jiri Slaby Cc: Muchun Song Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 +++--- mm/memory.c | 9 +++++---- mm/migrate.c | 5 ++--- mm/swapfile.c | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1baa198519a..61895cc01d09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr struct page *new_page) { __SetPageUptodate(new_page); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); @@ -5259,10 +5259,10 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_huge_pte_at(mm, haddr, ptep, + make_huge_pte(vma, new_page, 1)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/memory.c b/mm/memory.c index bc80d4effac9..5fea331b1560 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -720,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@ -735,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(!PageAnon(page)); + set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page); @@ -3640,8 +3640,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3652,6 +3650,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/migrate.c b/mm/migrate.c index cf25b00f03c8..6aa4b5326784 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -236,20 +236,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e1..e64207e2ef1d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1917,14 +1917,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); -- cgit v1.2.3 From 08d5b29eac7dd5e6c79b66d390ecbb9219e05931 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:33 -0800 Subject: mm: ptep_clear() page table helper We have ptep_get_and_clear() and ptep_get_and_clear_full() helpers to clear PTE from user page tables, but there is no variant for simple clear of a present PTE from user page tables without using a low level pte_clear() which can be either native or para-virtualised. Add a new ptep_clear() that can be used in common code to clear PTEs from page table. We will need this call later in order to add a hook for page table check. Link: https://lkml.kernel.org/r/20211221154650.1047963-3-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 6 ++++-- include/linux/pgtable.h | 8 ++++++++ mm/debug_vm_pgtable.c | 2 +- mm/khugepaged.c | 12 ++---------- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index b3166c33db39..f8b225fc9190 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -66,9 +66,11 @@ PTE Page Table Helpers +---------------------------+--------------------------------------------------+ | pte_mknotpresent | Invalidates a mapped PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears a PTE | +| ptep_clear | Clears a PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears a PTE | +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | +---------------------------+--------------------------------------------------+ | ptep_test_and_clear_young | Clears young from a PTE | +---------------------------+--------------------------------------------------+ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..bc8713a76e03 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2a2b24e87877..a7ac97c76762 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); barrier(); - pte_clear(args->mm, args->vaddr, args->ptep); + ptep_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e99101162f1a..9d40dd8890e5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -756,11 +756,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -774,11 +770,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); -- cgit v1.2.3 From df4e817b710809425d899340dbfa8504a3ca4ba5 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Fri, 14 Jan 2022 14:06:37 -0800 Subject: mm: page table check Check user page table entries at the time they are added and removed. Allows to synchronously catch memory corruption issues related to double mapping. When a pte for an anonymous page is added into page table, we verify that this pte does not already point to a file backed page, and vice versa if this is a file backed page that is being added we verify that this page does not have an anonymous mapping We also enforce that read-only sharing for anonymous pages is allowed (i.e. cow after fork). All other sharing must be for file pages. Page table check allows to protect and debug cases where "struct page" metadata became corrupted for some reason. For example, when refcnt or mapcount become invalid. Link: https://lkml.kernel.org/r/20211221154650.1047963-4-pasha.tatashin@soleen.com Signed-off-by: Pasha Tatashin Cc: Aneesh Kumar K.V Cc: Dave Hansen Cc: David Rientjes Cc: Frederic Weisbecker Cc: Greg Thelen Cc: "H. Peter Anvin" Cc: Hugh Dickins Cc: Ingo Molnar Cc: Jiri Slaby Cc: Jonathan Corbet Cc: Kees Cook Cc: Masahiro Yamada Cc: Mike Rapoport Cc: Muchun Song Cc: Paul Turner Cc: Peter Zijlstra Cc: Sami Tolvanen Cc: Thomas Gleixner Cc: Wei Xu Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/index.rst | 1 + Documentation/vm/page_table_check.rst | 56 +++++++ MAINTAINERS | 9 ++ arch/Kconfig | 3 + include/linux/page_table_check.h | 147 ++++++++++++++++++ mm/Kconfig.debug | 24 +++ mm/Makefile | 1 + mm/page_alloc.c | 4 + mm/page_ext.c | 4 + mm/page_table_check.c | 270 ++++++++++++++++++++++++++++++++++ 10 files changed, 519 insertions(+) create mode 100644 Documentation/vm/page_table_check.rst create mode 100644 include/linux/page_table_check.h create mode 100644 mm/page_table_check.c (limited to 'mm') diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index b1826ca2c576..932440805453 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -31,6 +31,7 @@ algorithms. If you are looking for advice on simply allocating memory, see the page_migration page_frags page_owner + page_table_check remap_file_pages slub split_page_table_lock diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst new file mode 100644 index 000000000000..81f521ff7ea7 --- /dev/null +++ b/Documentation/vm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _page_table_check: + +================ +Page Table Check +================ + +Introduction +============ + +Page table check allows to hardern the kernel by ensuring that some types of +the memory corruptions are prevented. + +Page table check performs extra verifications at the time when new pages become +accessible from the userspace by getting their page table entries (PTEs PMDs +etc.) added into the table. + +In case of detected corruption, the kernel is crashed. There is a small +performance and memory overhead associated with the page table check. Therefore, +it is disabled by default, but can be optionally enabled on systems where the +extra hardening outweighs the performance costs. Also, because page table check +is synchronous, it can help with debugging double map memory corruption issues, +by crashing kernel at the time wrong mapping occurs instead of later which is +often the case with memory corruptions bugs. + +Double mapping detection logic +============================== + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +Enabling Page Table Check +========================= + +Build kernel with: + +- PAGE_TABLE_CHECK=y + Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK + is available. + +- Boot with 'page_table_check=on' kernel parameter. + +Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page +table support without extra kernel parameter. diff --git a/MAINTAINERS b/MAINTAINERS index dd36acc87ce6..fbdb860c0b8b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14387,6 +14387,15 @@ F: include/net/page_pool.h F: include/trace/events/page_pool.h F: net/core/page_pool.c +PAGE TABLE CHECK +M: Pasha Tatashin +M: Andrew Morton +L: linux-mm@kvack.org +S: Maintained +F: Documentation/vm/page_table_check.rst +F: include/linux/page_table_check.h +F: mm/page_table_check.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan L: platform-driver-x86@vger.kernel.org diff --git a/arch/Kconfig b/arch/Kconfig index d3c4ab249e9c..4568b6b70b5d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SUPPORTS_PAGE_TABLE_CHECK + bool + config ARCH_SPLIT_ARG64 bool help diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h new file mode 100644 index 000000000000..38cace1da7b6 --- /dev/null +++ b/include/linux/page_table_check.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin + */ +#ifndef __LINUX_PAGE_TABLE_CHECK_H +#define __LINUX_PAGE_TABLE_CHECK_H + +#ifdef CONFIG_PAGE_TABLE_CHECK +#include + +extern struct static_key_true page_table_check_disabled; +extern struct page_ext_operations page_table_check_ops; + +void __page_table_check_zero(struct page *page, unsigned int order); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear(mm, addr, pte); +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear(mm, addr, pmd); +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_clear(mm, addr, pud); +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_set(mm, addr, ptep, pte); +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_set(mm, addr, pmdp, pmd); +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_set(mm, addr, pudp, pud); +} + +#else + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ +} + +#endif /* CONFIG_PAGE_TABLE_CHECK */ +#endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..5bd5bb097252 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,30 @@ config PAGE_OWNER If unsure, say N. +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". + config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index d6c0042e3aa0..5c5a3a480fa6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -112,6 +112,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d59023a676ed..806f317c2e7e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include #include @@ -1307,6 +1308,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1346,6 +1348,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -2420,6 +2423,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, diff --git a/mm/page_ext.c b/mm/page_ext.c index 6242afb24d84..bee3240604dc 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * struct page extension @@ -75,6 +76,9 @@ static struct page_ext_operations *page_ext_ops[] = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000000..7504e7caa2a1 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin + */ +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + __page_table_check_enabled = true; + else if (strcmp(buf, "off") == 0) + __page_table_check_enabled = false; + + return 0; +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && + (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && + (pud_val(pud) & _PAGE_USER); +} + +/* + * An enty is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. + */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * A new enty is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. + */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. + */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + int i; + + BUG_ON(!page_ext); + for (i = 0; i < (1 << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + pte_t old_pte; + + if (&init_mm == mm) + return; + + old_pte = *ptep; + if (pte_user_accessible_page(old_pte)) { + page_table_check_clear(mm, addr, pte_pfn(old_pte), + PAGE_SIZE >> PAGE_SHIFT); + } + + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd; + + if (&init_mm == mm) + return; + + old_pmd = *pmdp; + if (pmd_user_accessible_page(old_pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(old_pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + pud_t old_pud; + + if (&init_mm == mm) + return; + + old_pud = *pudp; + if (pud_user_accessible_page(old_pud)) { + page_table_check_clear(mm, addr, pud_pfn(old_pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT, + pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); -- cgit v1.2.3 From 020e87650af9f43683546729f959fdc78422a4b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:44 -0800 Subject: mm: remove last argument of reuse_swap_page() None of the callers care about the total_map_swapcount() any more. Link: https://lkml.kernel.org/r/20211220205943.456187-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Linus Torvalds Reviewed-by: William Kucharski Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 +++--- mm/huge_memory.c | 2 +- mm/khugepaged.c | 2 +- mm/memory.c | 2 +- mm/swapfile.c | 8 +------- 5 files changed, 7 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19..bdccbf1efa61 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page, NULL) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5483347291c..b61fbe95c856 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9d40dd8890e5..698ea19775ac 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -681,7 +681,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. diff --git a/mm/memory.c b/mm/memory.c index 5fea331b1560..571d02f419ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3627,7 +3627,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/swapfile.c b/mm/swapfile.c index e64207e2ef1d..31d13a393cf0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1668,12 +1668,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { int count, total_mapcount, total_swapcount; @@ -1682,8 +1678,6 @@ bool reuse_swap_page(struct page *page, int *total_map_swapcount) return false; count = page_trans_huge_map_swapcount(page, &total_mapcount, &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit v1.2.3 From 66c7f7a6ac6624fc7e226d43913e10f1f047f579 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:48 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_map_swapcount() Now that we don't report it to the caller of reuse_swap_page(), we don't need to request it from page_trans_huge_map_swapcount(). Link: https://lkml.kernel.org/r/20211220205943.456187-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 31d13a393cf0..a93f5b5fc8b6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page, NULL); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1671,13 +1664,12 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, */ bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ -- cgit v1.2.3 From d08d2b62510e2407cf939e693aefd179dc114913 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:06:51 -0800 Subject: mm: remove the total_mapcount argument from page_trans_huge_mapcount() All callers pass NULL, so we can stop calculating the value we would store in it. Link: https://lkml.kernel.org/r/20211220205943.456187-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Linus Torvalds Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 10 +++------- include/linux/swap.h | 2 +- mm/huge_memory.c | 30 ++++++++++-------------------- mm/swapfile.c | 2 +- 4 files changed, 15 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4d7245e6802a..cef65f9cbdf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -799,19 +799,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif diff --git a/include/linux/swap.h b/include/linux/swap.h index bdccbf1efa61..1d38d9475c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -681,7 +681,7 @@ static inline int swp_swapcount(swp_entry_t entry) } #define reuse_swap_page(page) \ - (page_trans_huge_mapcount(page, NULL) == 1) + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b61fbe95c856..6ed86a8f6a5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ diff --git a/mm/swapfile.c b/mm/swapfile.c index a93f5b5fc8b6..caa9f81a0d15 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1619,7 +1619,7 @@ static int page_trans_huge_map_swapcount(struct page *page, swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return swapcount + page_trans_huge_mapcount(page, NULL); + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); -- cgit v1.2.3 From cc6266f0322fa9f7f4543564759e881d989ad866 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 14 Jan 2022 14:06:54 -0800 Subject: mm/dmapool.c: revert "make dma pool to use kmalloc_node" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 2618c60b8b5836 ("dma: make dma pool to use kmalloc_node"). While working myself into the dmapool code I've found this little odd kmalloc_node(). What basically happens here is that we allocate the housekeeping structure on the numa node where the device is attached to. Since the device is never doing DMA to or from that memory this doesn't seem to make sense at all. So while this doesn't seem to cause much harm it's probably cleaner to revert the change for consistency. Link: https://lkml.kernel.org/r/20211221110724.97664-1-christian.koenig@amd.com Signed-off-by: Christian König Cc: Yinghai Lu Cc: Andi Kleen Cc: Christoph Lameter Cc: David Rientjes Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 64b537b3ccb0..a7eb5d0eb2da 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; -- cgit v1.2.3 From 451769ebb7e792c3404db53b3c2a422990de654e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:06:57 -0800 Subject: mm/vmalloc: alloc GFP_NO{FS,IO} for vmalloc Patch series "extend vmalloc support for constrained allocations", v2. Based on a recent discussion with Dave and Neil [1] I have tried to implement NOFS, NOIO, NOFAIL support for the vmalloc to make life of kvmalloc users easier. A requirement for NOFAIL support for kvmalloc was new to me but this seems to be really needed by the xfs code. NOFS/NOIO was a known and a long term problem which was hoped to be handled by the scope API. Those scope should have been used at the reclaim recursion boundaries both to document them and also to remove the necessity of NOFS/NOIO constrains for all allocations within that scope. Instead workarounds were developed to wrap a single allocation instead (like ceph_kvmalloc). First patch implements NOFS/NOIO support for vmalloc. The second one adds NOFAIL support and the third one bundles all together into kvmalloc and drops ceph_kvmalloc which can use kvmalloc directly now. [1] http://lkml.kernel.org/r/163184741778.29351.16920832234899124642.stgit@noble.brown This patch (of 4): vmalloc historically hasn't supported GFP_NO{FS,IO} requests because page table allocations do not support externally provided gfp mask and performed GFP_KERNEL like allocations. Since few years we have scope (memalloc_no{fs,io}_{save,restore}) APIs to enforce NOFS and NOIO constrains implicitly to all allocators within the scope. There was a hope that those scopes would be defined on a higher level when the reclaim recursion boundary starts/stops (e.g. when a lock required during the memory reclaim is required etc.). It seems that not all NOFS/NOIO users have adopted this approach and instead they have taken a workaround approach to wrap a single [k]vmalloc allocation by a scope API. These workarounds do not serve the purpose of a better reclaim recursion documentation and reduction of explicit GFP_NO{FS,IO} usege so let's just provide them with the semantic they are asking for without a need for workarounds. Add support for GFP_NOFS and GFP_NOIO to vmalloc directly. All internal allocations already comply with the given gfp_mask. The only current exception is vmap_pages_range which maps kernel page tables. Infer the proper scope API based on the given gfp mask. [sfr@canb.auug.org.au: mm/vmalloc.c needs linux/sched/mm.h] Link: https://lkml.kernel.org/r/20211217232641.0148710c@canb.auug.org.au Link: https://lkml.kernel.org/r/20211122153233.9924-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20211122153233.9924-2-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Stephen Rothwell Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Neil Brown Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jeff Layton Cc: Dave Chinner Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80c6de4c425f..8ed93510e50e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2928,6 +2929,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; + unsigned int flags; + int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2976,8 +2979,24 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift) < 0) { + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + ret = vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); -- cgit v1.2.3 From 9376130c390a76fac2788a5d6e1a149017b4ab50 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:01 -0800 Subject: mm/vmalloc: add support for __GFP_NOFAIL Dave Chinner has mentioned that some of the xfs code would benefit from kvmalloc support for __GFP_NOFAIL because they have allocations that cannot fail and they do not fit into a single page. The large part of the vmalloc implementation already complies with the given gfp flags so there is no work for those to be done. The area and page table allocations are an exception to that. Implement a retry loop for those. Add a short sleep before retrying. 1 jiffy is a completely random timeout. Ideally the retry would wait for an explicit event - e.g. a change to the vmalloc space change if the failure was caused by the space fragmentation or depletion. But there are multiple different reasons to retry and this could become much more complex. Keep the retry simple for now and just sleep to prevent from hogging CPUs. Link: https://lkml.kernel.org/r/20211122153233.9924-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 8ed93510e50e..d1bebd197c8c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2847,6 +2847,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; + while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2864,12 +2866,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolcy want to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(gfp, + nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(gfp, nid, + nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -2924,6 +2926,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t orig_gfp_mask = gfp_mask; + bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; @@ -2988,8 +2991,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) flags = memalloc_noio_save(); - ret = vmap_pages_range(addr, addr + size, prot, area->pages, + do { + ret = vmap_pages_range(addr, addr + size, prot, area->pages, page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) memalloc_nofs_restore(flags); @@ -3084,9 +3091,14 @@ again: VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { + bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, vm_struct allocation failed", - real_size); + "vmalloc error: size %lu, vm_struct allocation failed%s", + real_size, (nofail) ? ". Retrying." : ""); + if (nofail) { + schedule_timeout_uninterruptible(1); + goto again; + } goto fail; } -- cgit v1.2.3 From 30d3f01191d305c99e8b3f8b1b328fc852270c95 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:04 -0800 Subject: mm/vmalloc: be more explicit about supported gfp flags. Commit b7d90e7a5ea8 ("mm/vmalloc: be more explicit about supported gfp flags") has been merged prematurely without the rest of the series and without addressed review feedback from Neil. Fix that up now. Only wording is changed slightly. Link: https://lkml.kernel.org/r/20211122153233.9924-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d1bebd197c8c..4165304d3547 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3031,12 +3031,14 @@ fail: * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL would be a preferred allocation mode - * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not - * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka - * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka - * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). - * __GFP_NOWARN can be used to suppress error messages about failures. + * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all + * supported. + * Zone modifiers are not supported. From the reclaim modifiers + * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) + * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and + * __GFP_RETRY_MAYFAIL are not supported). + * + * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. -- cgit v1.2.3 From a421ef303008b0ceee2cfc625c3246fa7654b0ca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:07 -0800 Subject: mm: allow !GFP_KERNEL allocations for kvmalloc Support for GFP_NO{FS,IO} and __GFP_NOFAIL has been implemented by previous patches so we can allow the support for kvmalloc. This will allow some external users to simplify or completely remove their helpers. GFP_NOWAIT semantic hasn't been supported so far but it hasn't been explicitly documented so let's add a note about that. ceph_kvmalloc is the first helper to be dropped and changed to kvmalloc. Link: https://lkml.kernel.org/r/20211122153233.9924-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ceph/libceph.h | 1 - mm/util.c | 15 ++++----------- net/ceph/buffer.c | 4 ++-- net/ceph/ceph_common.c | 27 --------------------------- net/ceph/crypto.c | 2 +- net/ceph/messenger.c | 2 +- net/ceph/messenger_v2.c | 2 +- net/ceph/osdmap.c | 12 ++++++------ 8 files changed, 15 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..309acbcb5a8a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); struct fs_parameter; struct fc_log; diff --git a/mm/util.c b/mm/util.c index 741ba32a43ac..7e43369064c8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -563,13 +560,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) gfp_t kmalloc_flags = flags; void *ret; - /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. - */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore @@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 5622763ad402..7e51f128045d 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -7,7 +7,7 @@ #include #include -#include /* for ceph_kvmalloc */ +#include /* for kvmalloc */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = ceph_kvmalloc(len, gfp); + b->vec.iov_base = kvmalloc(len, gfp); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..9441b4a4912b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,33 +190,6 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); -/* - * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are - * compatible with (a superset of) GFP_KERNEL. This is because while the - * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. - * - * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. - */ -void *ceph_kvmalloc(size_t size, gfp_t flags) -{ - void *p; - - if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { - p = kvmalloc(size, flags); - } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { - unsigned int nofs_flag = memalloc_nofs_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - } else { - unsigned int noio_flag = memalloc_noio_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_noio_restore(noio_flag); - } - - return p; -} - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 92d89b331645..051d22c0e4ad 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -147,7 +147,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* - * Should be used for buffers allocated with ceph_kvmalloc(). + * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). * diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..7b891be799d2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1920,7 +1920,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, /* front */ if (front_len) { - m->front.iov_base = ceph_kvmalloc(front_len, flags); + m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index cc40ce4e02fb..c4099b641b38 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -308,7 +308,7 @@ static void *alloc_conn_buf(struct ceph_connection *con, int len) if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) return NULL; - buf = ceph_kvmalloc(len, GFP_NOIO); + buf = kvmalloc(len, GFP_NOIO); if (!buf) return NULL; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 75b738083523..2823bb3cff55 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -980,7 +980,7 @@ static struct crush_work *alloc_workspace(const struct crush_map *c) work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - work = ceph_kvmalloc(work_size, GFP_NOIO); + work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; @@ -1190,9 +1190,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (max == map->max_osd) return 0; - state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); - weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); - addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); @@ -1222,7 +1222,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; @@ -1503,7 +1503,7 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = ceph_kvmalloc( + map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) -- cgit v1.2.3 From 704687deaae768a818d7da0584ee021793a97684 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:11 -0800 Subject: mm: make slab and vmalloc allocators __GFP_NOLOCKDEP aware sl?b and vmalloc allocators reduce the given gfp mask for their internal needs. For that they use GFP_RECLAIM_MASK to preserve the reclaim behavior and constrains. __GFP_NOLOCKDEP is not a part of that mask because it doesn't really control the reclaim behavior strictly speaking. On the other hand it tells the underlying page allocator to disable reclaim recursion detection so arguably it should be part of the mask. Having __GFP_NOLOCKDEP in the mask will not alter the behavior in any form so this change is safe pretty much by definition. It also adds a support for this flag to SL?B and vmalloc allocators which will in turn allow its use to kvmalloc as well. A lack of the support has been noticed recently in http://lkml.kernel.org/r/20211119225435.GZ449541@dread.disaster.area Link: https://lkml.kernel.org/r/YZ9XtLY4AEjVuiEI@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Sebastian Andrzej Siewior Acked-by: Dave Chinner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Dave Chinner Cc: Ilya Dryomov Cc: Jeff Layton Cc: Neil Brown Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index bdac62e1eca7..c5834cc28a44 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -21,7 +21,7 @@ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) -- cgit v1.2.3 From ca831f29f8f25c97182e726429b38c0802200c8f Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Fri, 14 Jan 2022 14:07:24 -0800 Subject: mm: page_alloc: fix building error on -Werror=array-compare Arthur Marsh reported we would hit the error below when building kernel with gcc-12: CC mm/page_alloc.o mm/page_alloc.c: In function `mem_init_print_info': mm/page_alloc.c:8173:27: error: comparison between two arrays [-Werror=array-compare] 8173 | if (start <= pos && pos < end && size > adj) \ | In C++20, the comparision between arrays should be warned. Link: https://lkml.kernel.org/r/20211125130928.32465-1-sxwjean@me.com Signed-off-by: Xiongwei Song Reported-by: Arthur Marsh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 806f317c2e7e..c4ef450ac442 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8228,7 +8228,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) -- cgit v1.2.3 From be1a13eb51077b2ec5f7f4306f93dfece503a3f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 14 Jan 2022 14:07:27 -0800 Subject: mm: drop node from alloc_pages_vma alloc_pages_vma is meant to allocate a page with a vma specific memory policy. The initial node parameter is always a local node so it is pointless to waste a function argument for this. Drop the parameter. Link: https://lkml.kernel.org/r/YaSnlv4QpryEpesG@dhcp22.suse.cz Signed-off-by: Michal Hocko Cc: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: "Huang, Ying" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 8 ++++---- mm/mempolicy.c | 3 ++- mm/shmem.c | 3 +-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8fcc38467af6..78b58448f796 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -598,9 +598,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); + bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -610,14 +610,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 679f47b3a079..ed7d15acb6a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2084,9 +2084,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; + int node = numa_node_id(); struct page *page; int preferred_nid; nodemask_t *nmask; diff --git a/mm/shmem.c b/mm/shmem.c index 8f940552c182..0700e9acf53b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1564,8 +1564,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); -- cgit v1.2.3 From eaab8e753632b8e961701d02a5bb398c820f309c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:07:33 -0800 Subject: mm/page_alloc.c: modify the comment section for alloc_contig_pages() Clarify that the alloc_contig_pages() allocated range will always be aligned to the requested nr_pages. Link: https://lkml.kernel.org/r/1639545478-12160-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c4ef450ac442..3eace0065ecc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9272,8 +9272,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. -- cgit v1.2.3 From 62b3107073646e0946bd97ff926832bafb846d17 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:37 -0800 Subject: mm_zone: add function to check if managed dma zone exists Patch series "Handle warning of allocation failure on DMA zone w/o managed pages", v4. **Problem observed: On x86_64, when crash is triggered and entering into kdump kernel, page allocation failure can always be seen. --------------------------------- DMA: preallocated 128 KiB GFP_KERNEL pool for atomic allocations swapper/0: page allocation failure: order:5, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 1 Comm: swapper/0 Call Trace: dump_stack+0x7f/0xa1 warn_alloc.cold+0x72/0xd6 ...... __alloc_pages+0x24d/0x2c0 ...... dma_atomic_pool_init+0xdb/0x176 do_one_initcall+0x67/0x320 ? rcu_read_lock_sched_held+0x3f/0x80 kernel_init_freeable+0x290/0x2dc ? rest_init+0x24f/0x24f kernel_init+0xa/0x111 ret_from_fork+0x22/0x30 Mem-Info: ------------------------------------ ***Root cause: In the current kernel, it assumes that DMA zone must have managed pages and try to request pages if CONFIG_ZONE_DMA is enabled. While this is not always true. E.g in kdump kernel of x86_64, only low 1M is presented and locked down at very early stage of boot, so that this low 1M won't be added into buddy allocator to become managed pages of DMA zone. This exception will always cause page allocation failure if page is requested from DMA zone. ***Investigation: This failure happens since below commit merged into linus's tree. 1a6a9044b967 x86/setup: Remove CONFIG_X86_RESERVE_LOW and reservelow= options 23721c8e92f7 x86/crash: Remove crash_reserve_low_1M() f1d4d47c5851 x86/setup: Always reserve the first 1M of RAM 7c321eb2b843 x86/kdump: Remove the backup region handling 6f599d84231f x86/kdump: Always reserve the low 1M when the crashkernel option is specified Before them, on x86_64, the low 640K area will be reused by kdump kernel. So in kdump kernel, the content of low 640K area is copied into a backup region for dumping before jumping into kdump. Then except of those firmware reserved region in [0, 640K], the left area will be added into buddy allocator to become available managed pages of DMA zone. However, after above commits applied, in kdump kernel of x86_64, the low 1M is reserved by memblock, but not released to buddy allocator. So any later page allocation requested from DMA zone will fail. At the beginning, if crashkernel is reserved, the low 1M need be locked down because AMD SME encrypts memory making the old backup region mechanims impossible when switching into kdump kernel. Later, it was also observed that there are BIOSes corrupting memory under 1M. To solve this, in commit f1d4d47c5851, the entire region of low 1M is always reserved after the real mode trampoline is allocated. Besides, recently, Intel engineer mentioned their TDX (Trusted domain extensions) which is under development in kernel also needs to lock down the low 1M. So we can't simply revert above commits to fix the page allocation failure from DMA zone as someone suggested. ***Solution: Currently, only DMA atomic pool and dma-kmalloc will initialize and request page allocation with GFP_DMA during bootup. So only initializ DMA atomic pool when DMA zone has available managed pages, otherwise just skip the initialization. For dma-kmalloc(), for the time being, let's mute the warning of allocation failure if requesting pages from DMA zone while no manged pages. Meanwhile, change code to use dma_alloc_xx/dma_map_xx API to replace kmalloc(GFP_DMA), or do not use GFP_DMA when calling kmalloc() if not necessary. Christoph is posting patches to fix those under drivers/scsi/. Finally, we can remove the need of dma-kmalloc() as people suggested. This patch (of 3): In some places of the current kernel, it assumes that dma zone must have managed pages if CONFIG_ZONE_DMA is enabled. While this is not always true. E.g in kdump kernel of x86_64, only low 1M is presented and locked down at very early stage of boot, so that there's no managed pages at all in DMA zone. This exception will always cause page allocation failure if page is requested from DMA zone. Here add function has_managed_dma() and the relevant helper functions to check if there's DMA zone with managed pages. It will be used in later patches. Link: https://lkml.kernel.org/r/20211223094435.248523-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20211223094435.248523-2-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Reviewed-by: David Hildenbrand Acked-by: John Donnelly Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: David Laight Cc: Borislav Petkov Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 9 +++++++++ mm/page_alloc.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 936dc0b6c226..aed44e9b5d89 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1047,6 +1047,15 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3eace0065ecc..e2ad2303f634 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9518,3 +9518,18 @@ bool take_page_off_buddy(struct page *page) return ret; } #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ -- cgit v1.2.3 From c4dc63f0032c77464fbd4e7a6afc22fa6913c4a7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 14 Jan 2022 14:07:44 -0800 Subject: mm/page_alloc.c: do not warn allocation failure on zone DMA if no managed pages In kdump kernel of x86_64, page allocation failure is observed: kworker/u2:2: page allocation failure: order:0, mode:0xcc1(GFP_KERNEL|GFP_DMA), nodemask=(null),cpuset=/,mems_allowed=0 CPU: 0 PID: 55 Comm: kworker/u2:2 Not tainted 5.16.0-rc4+ #5 Hardware name: AMD Dinar/Dinar, BIOS RDN1505B 06/05/2013 Workqueue: events_unbound async_run_entry_fn Call Trace: dump_stack_lvl+0x48/0x5e warn_alloc.cold+0x72/0xd6 __alloc_pages_slowpath.constprop.0+0xc69/0xcd0 __alloc_pages+0x1df/0x210 new_slab+0x389/0x4d0 ___slab_alloc+0x58f/0x770 __slab_alloc.constprop.0+0x4a/0x80 kmem_cache_alloc_trace+0x24b/0x2c0 sr_probe+0x1db/0x620 ...... device_add+0x405/0x920 ...... __scsi_add_device+0xe5/0x100 ata_scsi_scan_host+0x97/0x1d0 async_run_entry_fn+0x30/0x130 process_one_work+0x1e8/0x3c0 worker_thread+0x50/0x3b0 ? rescuer_thread+0x350/0x350 kthread+0x16b/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Mem-Info: ...... The above failure happened when calling kmalloc() to allocate buffer with GFP_DMA. It requests to allocate slab page from DMA zone while no managed pages at all in there. sr_probe() --> get_capabilities() --> buffer = kmalloc(512, GFP_KERNEL | GFP_DMA); Because in the current kernel, dma-kmalloc will be created as long as CONFIG_ZONE_DMA is enabled. However, kdump kernel of x86_64 doesn't have managed pages on DMA zone since commit 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified"). The failure can be always reproduced. For now, let's mute the warning of allocation failure if requesting pages from DMA zone while no managed pages. [akpm@linux-foundation.org: fix warning] Link: https://lkml.kernel.org/r/20211223094435.248523-4-bhe@redhat.com Fixes: 6f599d84231f ("x86/kdump: Always reserve the low 1M when the crashkernel option is specified") Signed-off-by: Baoquan He Acked-by: John Donnelly Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Borislav Petkov Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Laight Cc: Marek Szyprowski Cc: Robin Murphy Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2ad2303f634..99fc65c532f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4218,7 +4218,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); -- cgit v1.2.3 From f47761999052b1cc987dd3e3d3adf47997358fc0 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Fri, 14 Jan 2022 14:07:48 -0800 Subject: hugetlb: add hugetlb.*.numa_stat file For hugetlb backed jobs/VMs it's critical to understand the numa information for the memory backing these jobs to deliver optimal performance. Currently this technically can be queried from /proc/self/numa_maps, but there are significant issues with that. Namely: 1. Memory can be mapped or unmapped. 2. numa_maps are per process and need to be aggregated across all processes in the cgroup. For shared memory this is more involved as the userspace needs to make sure it doesn't double count shared mappings. 3. I believe querying numa_maps needs to hold the mmap_lock which adds to the contention on this lock. For these reasons I propose simply adding hugetlb.*.numa_stat file, which shows the numa information of the cgroup similarly to memory.numa_stat. On cgroup-v2: cat /sys/fs/cgroup/unified/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 On cgroup-v1: cat /sys/fs/cgroup/hugetlb/test/hugetlb.2MB.numa_stat total=2097152 N0=2097152 N1=0 hierarichal_total=2097152 N0=2097152 N1=0 This patch was tested manually by allocating hugetlb memory and querying the hugetlb.*.numa_stat file of the cgroup and its parents. [colin.i.king@googlemail.com: fix spelling mistake "hierarichal" -> "hierarchical"] Link: https://lkml.kernel.org/r/20211125090635.23508-1-colin.i.king@gmail.com [keescook@chromium.org: fix copy/paste array assignment] Link: https://lkml.kernel.org/r/20211203065647.2819707-1-keescook@chromium.org Link: https://lkml.kernel.org/r/20211123001020.4083653-1-almasrymina@google.com Signed-off-by: Mina Almasry Signed-off-by: Colin Ian King Signed-off-by: Kees Cook Reviewed-by: Shakeel Butt Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Shuah Khan Cc: Miaohe Lin Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Jue Wang Cc: Yang Yao Cc: Joanna Li Cc: Cannon Matthews Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v1/hugetlb.rst | 4 + Documentation/admin-guide/cgroup-v2.rst | 5 + include/linux/hugetlb.h | 4 +- include/linux/hugetlb_cgroup.h | 7 ++ mm/hugetlb_cgroup.c | 133 ++++++++++++++++++++++-- 5 files changed, 141 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 338f2c7d7a1c..0fa724d82abb 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -29,12 +29,14 @@ Brief summary of control files:: hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb..failcnt # show the number of allocation failure due to HugeTLB usage limit + hugetlb..numa_stat # show the numa information of the hugetlb memory charged to this cgroup For a system supporting three hugepage sizes (64k, 32M and 1G), the control files include:: hugetlb.1GB.limit_in_bytes hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.numa_stat hugetlb.1GB.usage_in_bytes hugetlb.1GB.failcnt hugetlb.1GB.rsvd.limit_in_bytes @@ -43,6 +45,7 @@ files include:: hugetlb.1GB.rsvd.failcnt hugetlb.64KB.limit_in_bytes hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.numa_stat hugetlb.64KB.usage_in_bytes hugetlb.64KB.failcnt hugetlb.64KB.rsvd.limit_in_bytes @@ -51,6 +54,7 @@ files include:: hugetlb.64KB.rsvd.failcnt hugetlb.32MB.limit_in_bytes hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.numa_stat hugetlb.32MB.usage_in_bytes hugetlb.32MB.failcnt hugetlb.32MB.rsvd.limit_in_bytes diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4f400b03dddf..5aa368d165da 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2266,6 +2266,11 @@ HugeTLB Interface Files are local to the cgroup i.e. not hierarchical. The file modified event generated on this file reflects only the local events. + hugetlb..numa_stat + Similar to memory.numa_stat, it shows the numa information of the + hugetlb pages of in this cgroup. Only active in + use hugetlb pages are included. The per-node values are in bytes. + Misc ---- diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00351ccb49a3..d1897a69c540 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -622,8 +622,8 @@ struct hstate { #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files_dfl[7]; - struct cftype cgroup_files_legacy[9]; + struct cftype cgroup_files_dfl[8]; + struct cftype cgroup_files_legacy[10]; #endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index ba025ae27882..379344828e78 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -36,6 +36,11 @@ enum hugetlb_memory_event { HUGETLB_NR_MEMORY_EVENTS, }; +struct hugetlb_cgroup_per_node { + /* hugetlb usage in pages over all hstates. */ + unsigned long usage[HUGE_MAX_HSTATE]; +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; @@ -57,6 +62,8 @@ struct hugetlb_cgroup { /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; + + struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 79d93534ef1e..f9942841df18 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } } +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to -1 for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? node : -1; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cgroup; - - h_cgroup = hugetlb_cgroup_from_css(css); - kfree(h_cgroup); + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* @@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, return; __set_hugetlb_cgroup(page, h_cg, rsvd); - return; + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, @@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (rsvd) css_put(&h_cg->css); - - return; + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, @@ -418,6 +466,59 @@ enum { RES_RSVD_FAILCNT, }; +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, transverse the css tree to obtain the hierarchical + * node usage. + */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, -- cgit v1.2.3 From e9ea874a8ffb0f8ebed4f4981531a32c5b663d79 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Fri, 14 Jan 2022 14:07:55 -0800 Subject: mm/vmstat: add events for THP max_ptes_* exceeds There are interfaces to adjust max_ptes_none, max_ptes_swap, max_ptes_shared values, see /sys/kernel/mm/transparent_hugepage/khugepaged/. But system administrator may not know which value is the best. So Add those events to support adjusting max_ptes_* to suitable values. For example, if default max_ptes_swap value causes too much failures, and system uses zram whose IO is fast, administrator could increase max_ptes_swap until THP_SCAN_EXCEED_SWAP_PTE not increase anymore. Link: https://lkml.kernel.org/r/20211225094036.574157-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: "Huang, Ying" Cc: Dave Hansen Cc: Minchan Kim Cc: Saravanan D Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 3 +++ mm/khugepaged.c | 7 +++++++ mm/vmstat.c | 3 +++ 3 files changed, 13 insertions(+) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..7b2363388bfa 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SCAN_EXCEED_NONE_PTE, + THP_SCAN_EXCEED_SWAP_PTE, + THP_SCAN_EXCEED_SHARED_PTE, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 698ea19775ac..02071f213c58 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } @@ -1253,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@ -1262,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@ -1290,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } @@ -2000,6 +2005,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; @@ -2046,6 +2052,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --git a/mm/vmstat.c b/mm/vmstat.c index d701c335628c..4057372745d0 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1353,6 +1353,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif -- cgit v1.2.3 From e4b424b7ec8791087375bb1f2480a3ba05d21e0b Mon Sep 17 00:00:00 2001 From: Gang Li Date: Fri, 14 Jan 2022 14:08:07 -0800 Subject: vmscan: make drop_slab_node static drop_slab_node is only used in drop_slab. So remove it's declaration from header file and add keyword static for it's definition. Link: https://lkml.kernel.org/r/20211111062445.5236-1-ligang.bdlg@bytedance.com Signed-off-by: Gang Li Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 - mm/vmscan.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index cef65f9cbdf2..eb67eb699b78 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3122,7 +3122,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 diff --git a/mm/vmscan.c b/mm/vmscan.c index 700434db5735..090bfb605ecf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -951,7 +951,7 @@ out: return freed; } -void drop_slab_node(int nid) +static void drop_slab_node(int nid) { unsigned long freed; int shift = 0; -- cgit v1.2.3 From 721fb891ad0b3956d5c168b2931e3e5e4fb7ca40 Mon Sep 17 00:00:00 2001 From: Chen Wandun Date: Fri, 14 Jan 2022 14:08:10 -0800 Subject: mm/page_isolation: unset migratetype directly for non Buddy page In unset_migratetype_isolate(), we can bypass the call to move_freepages_block() for non-buddy pages. It will save a few cpu cycles for some situations such as cma and hugetlb when allocating continue pages, in these situation function alloc_contig_pages will be called. alloc_contig_pages __alloc_contig_migrate_range isolate_freepages_range ==> pages has been remove from buddy undo_isolate_page_range unset_migratetype_isolate ==> can directly set migratetype [osalvador@suse.de: changelog tweak] Link: https://lkml.kernel.org/r/20211229033649.2760586-1-chenwandun@huawei.com Fixes: 3c605096d315 ("mm/page_alloc: restrict max order of merging on isolated pageblock") Signed-off-by: Chen Wandun Reviewed-by: Oscar Salvador Cc: Vlastimil Babka Cc: Joonsoo Kim Cc: Wang Kefeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f67c4c70f17f..6a0ddda6b3c5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page) { + if (!isolated_page && PageBuddy(page)) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } -- cgit v1.2.3 From c04551162167368022a61899843821bbf015b473 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:14 -0800 Subject: mm/mempolicy: use policy_node helper with MPOL_PREFERRED_MANY Patch series "mm: add new syscall set_mempolicy_home_node", v6. This patch (of 3): A followup patch will enable setting a home node with MPOL_PREFERRED_MANY memory policy. To facilitate that switch to using policy_node helper. There is no functional change in this patch. Link: https://lkml.kernel.org/r/20211202123810.267175-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20211202123810.267175-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ed7d15acb6a2..de70f119984a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2062,7 +2062,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2104,6 +2104,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2187,7 +2188,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), -- cgit v1.2.3 From c6018b4b254971863bd0ad36bb5e7d0fa0f0ddb0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 14 Jan 2022 14:08:17 -0800 Subject: mm/mempolicy: add set_mempolicy_home_node syscall This syscall can be used to set a home node for the MPOL_BIND and MPOL_PREFERRED_MANY memory policy. Users should use this syscall after setting up a memory policy for the specified range as shown below. mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node((unsigned long)p, nr_pages * page_size, home_node, 0); The syscall allows specifying a home node/preferred node from which kernel will fulfill memory allocation requests first. For address range with MPOL_BIND memory policy, if nodemask specifies more than one node, page allocations will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. For MPOL_PREFERRED_MANY if the nodemask specifies more than one node, page allocation will come from the node in the nodemask with sufficient free memory that is closest to the home node/preferred node. If there is not enough memory in all the nodes specified in the nodemask, the allocation will be attempted from the closest numa node to the home node in the system. This helps applications to hint at a memory allocation preference node and fallback to _only_ a set of nodes if the memory is not available on the preferred node. Fallback allocation is attempted from the node which is nearest to the preferred node. This helps applications to have control on memory allocation numa nodes and avoids default fallback to slow memory NUMA nodes. For example a system with NUMA nodes 1,2 and 3 with DRAM memory and 10, 11 and 12 of slow memory new_nodes = numa_bitmask_alloc(nr_nodes); numa_bitmask_setbit(new_nodes, 1); numa_bitmask_setbit(new_nodes, 2); numa_bitmask_setbit(new_nodes, 3); p = mmap(NULL, nr_pages * page_size, protflag, mapflag, -1, 0); mbind(p, nr_pages * page_size, MPOL_BIND, new_nodes->maskp, new_nodes->size + 1, 0); sys_set_mempolicy_home_node(p, nr_pages * page_size, 2, 0); This will allocate from nodes closer to node 2 and will make sure the kernel will only allocate from nodes 1, 2, and 3. Memory will not be allocated from slow memory nodes 10, 11, and 12. This differs from default MPOL_BIND behavior in that with default MPOL_BIND the allocation will be attempted from node closer to the local node. One of the reasons to specify a home node is to allow allocations from cpu less NUMA node and its nearby NUMA nodes. With MPOL_PREFERRED_MANY on the other hand will first try to allocate from the closest node to node 2 from the node list 1, 2 and 3. If those nodes don't have enough memory, kernel will allocate from slow memory node 10, 11 and 12 which ever is closer to node 2. Link: https://lkml.kernel.org/r/20211202123810.267175-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Ben Widawsky Cc: Dave Hansen Cc: Feng Tang Cc: Michal Hocko Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Mike Kravetz Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andi Kleen Cc: Dan Williams Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/mm/numa_memory_policy.rst | 16 ++++- include/linux/mempolicy.h | 1 + mm/mempolicy.c | 79 ++++++++++++++++++++++ 3 files changed, 95 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 64fd0ba0d057..5a6afecbb0d0 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -408,7 +408,7 @@ follows: Memory Policy APIs ================== -Linux supports 3 system calls for controlling memory policy. These APIS +Linux supports 4 system calls for controlling memory policy. These APIS always affect only the calling task, the calling task's address space, or some shared object mapped into the calling task's address space. @@ -460,6 +460,20 @@ requested via the 'flags' argument. See the mbind(2) man page for more details. +Set home node for a Range of Task's Address Spacec:: + + long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); + +sys_set_mempolicy_home_node set the home node for a VMA policy present in the +task's address range. The system call updates the home node only for the existing +mempolicy range. Other address ranges are ignored. A home node is the NUMA node +closest to which page allocation will come from. Specifying the home node override +the default allocation policy to allocate memory close to the local node for an +executing CPU. + + Memory Policy Command Line Interface ==================================== diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c7595e81150..668389b4b53d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -46,6 +46,7 @@ struct mempolicy { unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index de70f119984a..fc6cae7926f3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -296,6 +296,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -1478,6 +1479,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1802,6 +1874,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2344,6 +2421,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; -- cgit v1.2.3 From dad5b0232949818ae581ebd089c7013e2fdbb093 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 14 Jan 2022 14:08:24 -0800 Subject: mm/mempolicy: fix all kernel-doc warnings Fix kernel-doc warnings in mempolicy.c: mempolicy.c:139: warning: No description found for return value of 'numa_map_to_online_node' mempolicy.c:2165: warning: Excess function parameter 'node' description in 'alloc_pages_vma' mempolicy.c:2973: warning: No description found for return value of 'mpol_parse_str' Link: https://lkml.kernel.org/r/20211213233216.5477-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fc6cae7926f3..028e8dd82b44 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. + * + * Return: this @node if it is online, otherwise the closest node by distance */ int numa_map_to_online_node(int node) { @@ -2150,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * @order: Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. - * @node: Which node to prefer for allocation (modulo policy). * @hugepage: For hugepages try only the preferred node if possible. * * Allocate a page for a specific address in @vma, using the appropriate @@ -2966,7 +2967,7 @@ static const char * const policy_modes[] = * Format of input: * [=][:] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { -- cgit v1.2.3 From f530243a172d2ff03f88d0056f838928d6445c6d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 14 Jan 2022 14:08:27 -0800 Subject: mm, oom: OOM sysrq should always kill a process The OOM kill sysrq (alt+sysrq+F) should allow the user to kill the process with the highest OOM badness with a single execution. However, at the moment, the OOM kill can bail out if an OOM notifier (e.g. the i915 one) says that it reclaimed a tiny amount of memory from somewhere. That's probably not what the user wants, so skip the bailout if the OOM was triggered via sysrq. Link: https://lkml.kernel.org/r/20220106102605.635656-1-jannh@google.com Signed-off-by: Jann Horn Acked-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3390316c8a32..3934ff500878 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1058,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) + if (freed > 0 && !is_sysrq_oom(oc)) /* Got some memory back in the last second. */ return true; } -- cgit v1.2.3 From b5bade978e9b8f42521ccef711642bd21313cf44 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:34 -0800 Subject: mm: migrate: fix the return value of migrate_pages() Patch series "Improve the migration stats". According to talk with Zi Yan [1], this patch set changes the return value of migrate_pages() to avoid returning a number which is larger than the number of pages the users tried to migrate by move_pages() syscall. Also fix the hugetlb migration stats and migration stats in trace_mm_compaction_migratepages(). [1] https://lore.kernel.org/linux-mm/7E44019D-2A5D-4BA7-B4D5-00D4712F1687@nvidia.com/ This patch (of 3): As Zi Yan pointed out, the syscall move_pages() can return a non-migrated number larger than the number of pages the users tried to migrate, when a THP page is failed to migrate. This is confusing for users. Since other migration scenarios do not care about the actual non-migrated number of pages except the memory compaction migration which will fix in following patch. Thus we can change the return value to return the number of {normal page, THP, hugetlb} instead to avoid this issue, and the number of THP splits will be considered as the number of non-migrated THP, no matter how many subpages of the THP are migrated successfully. Meanwhile we should still keep the migration counters using the number of normal pages. Link: https://lkml.kernel.org/r/cover.1636275127.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/6486fabc3e8c66ff613e150af25e89b3147977a6.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Zi Yan Co-developed-by: Zi Yan Cc: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 63 +++++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6aa4b5326784..57bc8b97490c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1421,7 +1421,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@ -1429,7 +1429,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP} that were not migrated, or an error code. + * The number of THP splits will be considered as the number of non-migrated THP, + * no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1438,6 +1440,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@ -1449,13 +1452,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false; trace_mm_migrate_pages_start(mode, reason); if (!swapwrite) current->flags |= PF_SWAPWRITE; +thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@ -1504,18 +1510,20 @@ retry: case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } /* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; break; case -ENOMEM: /* @@ -1524,16 +1532,19 @@ retry: * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; goto out; case -EAGAIN: if (is_thp) { @@ -1559,17 +1570,37 @@ retry: */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages++; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. + */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@ -1578,11 +1609,11 @@ out: list_splice(&ret_pages, from); count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) -- cgit v1.2.3 From 5d39a7ebc8be70e30176aed6f98f799bfa7439d6 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:37 -0800 Subject: mm: migrate: correct the hugetlb migration stats Correct the migration stats for hugetlb with using compound_nr() instead of thp_nr_pages(), meanwhile change 'nr_failed_pages' to record the number of normal pages failed to migrate, including THP and hugetlb, and 'nr_succeeded' will record the number of normal pages migrated successfully. [baolin.wang@linux.alibaba.com: fix docs, per Mike] Link: https://lkml.kernel.org/r/141bdfc6-f898-3cc3-f692-726c5f6cb74d@linux.alibaba.com Link: https://lkml.kernel.org/r/71a4b6c22f208728fe8c78ad26375436c4ff9704.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Zi Yan Cc: Steven Rostedt (VMware) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration.rst | 12 ++++++------ mm/migrate.c | 17 ++++++++--------- 2 files changed, 14 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 08810f549f70..8c5cb8147e55 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -263,15 +263,15 @@ Monitoring Migration The following events (counters) can be used to monitor page migration. 1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a - page was migrated. If the page was a non-THP page, then this counter is - increased by one. If the page was a THP, then this counter is increased by - the number of THP subpages. For example, migration of a single 2MB THP that - has 4KB-size base pages (subpages) will cause this counter to increase by - 512. + page was migrated. If the page was a non-THP and non-hugetlb page, then + this counter is increased by one. If the page was a THP or hugetlb, then + this counter is increased by the number of THP or hugetlb subpages. + For example, migration of a single 2MB THP that has 4KB-size base pages + (subpages) will cause this counter to increase by 512. 2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, - if it was a THP. + if it was a THP or hugetlb. 3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. diff --git a/mm/migrate.c b/mm/migrate.c index 57bc8b97490c..6f04aa2a3bd4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1429,9 +1429,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of {normal page, THP} that were not migrated, or an error code. - * The number of THP splits will be considered as the number of non-migrated THP, - * no matter how many subpages of the THP are migrated successfully. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1474,7 +1474,7 @@ retry: * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched(); if (PageHuge(page)) @@ -1523,7 +1523,7 @@ retry: /* Hugetlb migration is unsupported */ if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@ -1544,7 +1544,7 @@ retry: if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@ -1554,12 +1554,11 @@ retry: retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@ -1576,7 +1575,7 @@ retry: if (!no_subpage_counting) nr_failed++; - nr_failed_pages++; + nr_failed_pages += nr_subpages; break; } } -- cgit v1.2.3 From 84b328aa81216e08804d8875d63f26bda1298788 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:40 -0800 Subject: mm: compaction: fix the migration stats in trace_mm_compaction_migratepages() Now the migrate_pages() has changed to return the number of {normal page, THP, hugetlb} instead, thus we should not use the return value to calculate the number of pages migrated successfully. Instead we can just use the 'nr_succeeded' which indicates the number of normal pages migrated successfully to calculate the non-migrated pages in trace_mm_compaction_migratepages(). Link: https://lkml.kernel.org/r/b4225251c4bec068dcd90d275ab7de88a39e2bd7.1636275127.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Steven Rostedt (VMware) Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 24 ++++-------------------- mm/compaction.c | 7 ++++--- 2 files changed, 8 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 54e5bf081171..7d48e7079e48 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -68,10 +68,9 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, - int migrate_rc, - struct list_head *migratepages), + unsigned int nr_succeeded), - TP_ARGS(nr_all, migrate_rc, migratepages), + TP_ARGS(nr_all, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -79,23 +78,8 @@ TRACE_EVENT(mm_compaction_migratepages, ), TP_fast_assign( - unsigned long nr_failed = 0; - struct list_head *page_lru; - - /* - * migrate_pages() returns either a non-negative number - * with the number of pages that failed migration, or an - * error code, in which case we need to count the remaining - * pages manually - */ - if (migrate_rc >= 0) - nr_failed = migrate_rc; - else - list_for_each(page_lru, migratepages) - nr_failed++; - - __entry->nr_migrated = nr_all - nr_failed; - __entry->nr_failed = nr_failed; + __entry->nr_migrated = nr_succeeded; + __entry->nr_failed = nr_all - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", diff --git a/mm/compaction.c b/mm/compaction.c index 6e446094ce90..b4e94cda3019 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + unsigned int nr_succeeded = 0; /* * These counters track activities during zone compaction. Initialize @@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION, NULL); + MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc->nr_migratepages, err, - &cc->migratepages); + trace_mm_compaction_migratepages(cc->nr_migratepages, + nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; -- cgit v1.2.3 From ac16ec835314677dd7405dfb5a5e007c3ca424c7 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:43 -0800 Subject: mm: migrate: support multiple target nodes demotion We have some machines with multiple memory types like below, which have one fast (DRAM) memory node and two slow (persistent memory) memory nodes. According to current node demotion policy, if node 0 fills up, its memory should be migrated to node 1, when node 1 fills up, its memory will be migrated to node 2: node 0 -> node 1 -> node 2 ->stop. But this is not efficient and suitbale memory migration route for our machine with multiple slow memory nodes. Since the distance between node 0 to node 1 and node 0 to node 2 is equal, and memory migration between slow memory nodes will increase persistent memory bandwidth greatly, which will hurt the whole system's performance. Thus for this case, we can treat the slow memory node 1 and node 2 as a whole slow memory region, and we should migrate memory from node 0 to node 1 and node 2 if node 0 fills up. This patch changes the node_demotion data structure to support multiple target nodes, and establishes the migration path to support multiple target nodes with validating if the node distance is the best or not. available: 3 nodes (0-2) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 0 size: 62153 MB node 0 free: 55135 MB node 1 cpus: node 1 size: 127007 MB node 1 free: 126930 MB node 2 cpus: node 2 size: 126968 MB node 2 free: 126878 MB node distances: node 0 1 2 0: 10 20 20 1: 20 10 20 2: 20 20 10 Link: https://lkml.kernel.org/r/00728da107789bb4ed9e0d28b1d08fd8056af2ef.1636697263.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Cc: Dave Hansen Cc: Zi Yan Cc: Oscar Salvador Cc: Yang Shi Cc: Baolin Wang Cc: zhongjiang-ali Cc: Xunlei Pang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 129 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 6f04aa2a3bd4..9d2642a34018 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -1118,12 +1119,25 @@ out: * * This is represented in the node_demotion[] like this: * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate */ /* @@ -1134,8 +1148,20 @@ out: * must be held over all reads to ensure that no cycles are * observed. */ -static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; /** * next_demotion_node() - Get the next node in the demotion path @@ -1148,8 +1174,15 @@ static int node_demotion[MAX_NUMNODES] __read_mostly = */ int next_demotion_node(int node) { + struct demotion_nodes *nd; + unsigned short target_nr, index; int target; + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + /* * node_demotion[] is updated without excluding this * function from running. RCU doesn't provide any @@ -1160,9 +1193,28 @@ int next_demotion_node(int node) * node_demotion[] reads need to be consistent. */ rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); + target_nr = READ_ONCE(nd->nr); + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); return target; } @@ -3003,10 +3055,16 @@ EXPORT_SYMBOL(migrate_vma_finalize); /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; - for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + if (!node_demotion) + return; + + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } } static void disable_all_migrate_targets(void) @@ -3033,26 +3091,40 @@ static void disable_all_migrate_targets(void) * Failing here is OK. It might just indicate * being at the end of a chain. */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd; - /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE; + nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE; - node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++; return migration_target; } @@ -3068,7 +3140,9 @@ static int establish_migrate_target(int node, nodemask_t *used) * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@ -3080,7 +3154,7 @@ static void __set_migration_target_nodes(void) nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance; /* * Avoid any oddities like cycles that could occur @@ -3109,18 +3183,33 @@ again: * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets); - if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1; /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@ -3221,6 +3310,11 @@ static int __init migrate_on_reclaim_init(void) { int ret; + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* -- cgit v1.2.3 From 7813a1b5257b8eb2cb915cd08e7ba857070fdfd3 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:08:46 -0800 Subject: mm: migrate: add more comments for selecting target node randomly As Yang Shi suggested [1], it will be helpful to explain why we should select target node randomly now if there are multiple target nodes. [1] https://lore.kernel.org/all/CAHbLzkqSqCL+g7dfzeOw8fPyeEC0BBv13Ny1UVGHDkadnQdR=g@mail.gmail.com/ Link: https://lkml.kernel.org/r/c31d36bd097c6e9e69fc0f409c43b78e53e64fc2.1637766801.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: "Huang, Ying" Cc: Dave Hansen Cc: Zi Yan Cc: zhongjiang-ali Cc: Xunlei Pang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 9d2642a34018..f50087d3ebf2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1206,6 +1206,14 @@ int next_demotion_node(int node) /* * If there are multiple target nodes, just select one * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. */ index = get_random_int() % target_nr; break; -- cgit v1.2.3 From dcee9bf5bf2f59c173f3645ac2274595ac6c6aea Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:08:49 -0800 Subject: mm/migrate: move node demotion code to near its user Now, node_demotion and next_demotion_node() are placed between __unmap_and_move() and unmap_and_move(). This hurts code readability. So move them near their users in the file. There's no functionality change in this patch. Link: https://lkml.kernel.org/r/20211206031227.3323097-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Baolin Wang Reviewed-by: Yang Shi Reviewed-by: Wei Xu Cc: Dave Hansen Cc: Zi Yan Cc: Oscar Salvador Cc: Michal Hocko Cc: David Rientjes Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Thelen Cc: Keith Busch Cc: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 265 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 132 insertions(+), 133 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index f50087d3ebf2..e50b80534d80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1093,139 +1093,6 @@ out: return rc; } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 - * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 - * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate - * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 - * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 - * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate - * - * Moreover some systems may have multiple slow memory nodes. - * Suppose a system has one socket with 3 memory nodes, node 0 - * is fast memory type, and node 1/2 both are slow memory - * type, and the distance between fast memory node and slow - * memory node is same. So the migration path should be: - * - * 0 -> 1/2 -> stop - * - * This is represented in the node_demotion[] like this: - * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 - * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate - * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -#define DEFAULT_DEMOTION_TARGET_NODES 15 - -#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES -#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) -#else -#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES -#endif - -struct demotion_nodes { - unsigned short nr; - short nodes[DEMOTION_TARGET_NODES]; -}; - -static struct demotion_nodes *node_demotion __read_mostly; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ - struct demotion_nodes *nd; - unsigned short target_nr, index; - int target; - - if (!node_demotion) - return NUMA_NO_NODE; - - nd = &node_demotion[node]; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target_nr = READ_ONCE(nd->nr); - - switch (target_nr) { - case 0: - target = NUMA_NO_NODE; - goto out; - case 1: - index = 0; - break; - default: - /* - * If there are multiple target nodes, just select one - * target node randomly. - * - * In addition, we can also use round-robin to select - * target node, but we should introduce another variable - * for node_demotion[] to record last selected target node, - * that may cause cache ping-pong due to the changing of - * last target node. Or introducing per-cpu data to avoid - * caching issue, which seems more complicated. So selecting - * target node randomly seems better until now. - */ - index = get_random_int() % target_nr; - break; - } - - target = READ_ONCE(nd->nodes[index]); - -out: - rcu_read_unlock(); - return target; -} - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -3059,6 +2926,138 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); + return target; +} + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) -- cgit v1.2.3 From f1e8db04b68cc56edc5baee5c7cb1f9b79c3da7e Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 14 Jan 2022 14:08:53 -0800 Subject: mm/migrate: remove redundant variables used in a for-loop The variable addr is being set and incremented in a for-loop but not actually being used. It is redundant and so addr and also variable start can be removed. Link: https://lkml.kernel.org/r/20211221185729.609630-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index e50b80534d80..05af2b2336b9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2481,8 +2481,7 @@ static bool migrate_vma_check_page(struct page *page) static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true; lru_add_drain(); @@ -2528,7 +2527,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } } - for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) -- cgit v1.2.3 From e1c63e110f977205ab9dfb38989c54e6e7b52a7b Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Fri, 14 Jan 2022 14:08:59 -0800 Subject: mm: ksm: fix use-after-free kasan report in ksm_might_need_to_copy When under the stress of swapping in/out with KSM enabled, there is a low probability that kasan reports the BUG of use-after-free in ksm_might_need_to_copy() when do swap in. The freed object is the anon_vma got from page_anon_vma(page). It is because a swapcache page associated with one anon_vma now needed for another anon_vma, but the page's original vma was unmapped and the anon_vma was freed. In this case the if condition below always return false and then alloc a new page to copy. Swapin process then use the new page and can continue to run well, so this is harmless actually. } else if (anon_vma->root == vma->anon_vma->root && page->index == linear_page_index(vma, address)) { This patch exchange the order of above two judgment statement to avoid the kasan warning. Let cpu run "page->index == linear_page_index(vma, address)" firstly and return false basically to skip the read of anon_vma->root which may trigger the kasan use-after-free warning: ================================================================== BUG: KASAN: use-after-free in ksm_might_need_to_copy+0x12e/0x5b0 Read of size 8 at addr ffff88be9977dbd0 by task khugepaged/694 CPU: 8 PID: 694 Comm: khugepaged Kdump: loaded Tainted: G OE - 4.18.0.x86_64 Hardware name: 1288H V5/BC11SPSC0, BIOS 7.93 01/14/2021 Call Trace: dump_stack+0xf1/0x19b print_address_description+0x70/0x360 kasan_report+0x1b2/0x330 ksm_might_need_to_copy+0x12e/0x5b0 do_swap_page+0x452/0xe70 __collapse_huge_page_swapin+0x24b/0x720 khugepaged_scan_pmd+0xcae/0x1ff0 khugepaged+0x8ee/0xd70 kthread+0x1a2/0x1d0 ret_from_fork+0x1f/0x40 Allocated by task 2306153: kasan_kmalloc+0xa0/0xd0 kmem_cache_alloc+0xc0/0x1c0 anon_vma_clone+0xf7/0x380 anon_vma_fork+0xc0/0x390 copy_process+0x447b/0x4810 _do_fork+0x118/0x620 do_syscall_64+0x112/0x360 entry_SYSCALL_64_after_hwframe+0x65/0xca Freed by task 2306242: __kasan_slab_free+0x130/0x180 kmem_cache_free+0x78/0x1d0 unlink_anon_vmas+0x19c/0x4a0 free_pgtables+0x137/0x1b0 exit_mmap+0x133/0x320 mmput+0x15e/0x390 do_exit+0x8c5/0x1210 do_group_exit+0xb5/0x1b0 __x64_sys_exit_group+0x21/0x30 do_syscall_64+0x112/0x360 entry_SYSCALL_64_after_hwframe+0x65/0xca The buggy address belongs to the object at ffff88be9977dba0 which belongs to the cache anon_vma_chain of size 64 The buggy address is located 48 bytes inside of 64-byte region [ffff88be9977dba0, ffff88be9977dbe0) The buggy address belongs to the page: page:ffffea00fa65df40 count:1 mapcount:0 mapping:ffff888107717800 index:0x0 flags: 0x17ffffc0000100(slab) ================================================================== Link: https://lkml.kernel.org/r/20211202102940.1069634-1-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index f34476ac0a41..c20bd4d9a0d9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2576,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { + } else if (page->index == linear_page_index(vma, address) && + anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (!PageUptodate(page)) -- cgit v1.2.3 From 91d005479e06392617bacc114509d611b705eaac Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:02 -0800 Subject: mm/hwpoison: mf_mutex for soft offline and unpoison Patch series "mm/hwpoison: fix unpoison_memory()", v4. The main purpose of this series is to sync unpoison code to recent changes around how hwpoison code takes page refcount. Unpoison should work or simply fail (without crash) if impossible. The recent works of keeping hwpoison pages in shmem pagecache introduce a new state of hwpoisoned pages, but unpoison for such pages is not supported yet with this series. It seems that soft-offline and unpoison can be used as general purpose page offline/online mechanism (not in the context of memory error). I think that we need some additional works to realize it because currently soft-offline and unpoison are assumed not to happen so frequently (print out too many messages for aggressive usecases). But anyway this could be another interesting next topic. v1: https://lore.kernel.org/linux-mm/20210614021212.223326-1-nao.horiguchi@gmail.com/ v2: https://lore.kernel.org/linux-mm/20211025230503.2650970-1-naoya.horiguchi@linux.dev/ v3: https://lore.kernel.org/linux-mm/20211105055058.3152564-1-naoya.horiguchi@linux.dev/ This patch (of 3): Originally mf_mutex is introduced to serialize multiple MCE events, but it is not that useful to allow unpoison to run in parallel with memory_failure() and soft offline. So apply mf_mutex to soft offline and unpoison. The memory failure handler and soft offline handler get simpler with this. Link: https://lkml.kernel.org/r/20211115084006.3728254-1-naoya.horiguchi@linux.dev Link: https://lkml.kernel.org/r/20211115084006.3728254-2-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 62 ++++++++++++++++------------------------------------- 1 file changed, 18 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5f8ad5527506..607785491a52 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1502,14 +1502,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) lock_page(head); page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1623,6 +1615,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1649,7 +1643,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1783,16 +1776,6 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@ -1973,6 +1956,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + int ret = 0; unsigned long flags = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1983,39 +1967,30 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; - } - - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (!get_hwpoison_page(p, flags)) { @@ -2023,29 +1998,23 @@ int unpoison_memory(unsigned long pfn) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); num_poisoned_pages_dec(); freeit = 1; } - unlock_page(page); put_page(page); if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2226,9 +2195,12 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } @@ -2247,5 +2219,7 @@ retry: } } + mutex_unlock(&mf_mutex); + return ret; } -- cgit v1.2.3 From c9fdc4d5487a16bd1f003fc8b66e91f88efb50e6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:06 -0800 Subject: mm/hwpoison: remove MF_MSG_BUDDY_2ND and MF_MSG_POISONED_HUGE These action_page_types are no longer used, so remove them. Link: https://lkml.kernel.org/r/20211115084006.3728254-3-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Acked-by: Yang Shi Cc: "Aneesh Kumar K.V" Cc: David Hildenbrand Cc: Ding Hui Cc: Miaohe Lin Cc: Michal Hocko Cc: Oscar Salvador Cc: Peter Xu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 -- include/ras/ras_event.h | 2 -- mm/memory-failure.c | 2 -- 3 files changed, 6 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb67eb699b78..7f594da84aca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3201,7 +3201,6 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@ -3216,7 +3215,6 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 0bdbc0d17d2f..d0337a41141c 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -358,7 +358,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ EM ( MF_MSG_SLAB, "kernel slab page" ) \ EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ - EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ @@ -373,7 +372,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 607785491a52..810328fe8adb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -723,7 +723,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -738,7 +737,6 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", -- cgit v1.2.3 From bf181c582588f8f7406d52f2ee228539b465f173 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 14 Jan 2022 14:09:09 -0800 Subject: mm/hwpoison: fix unpoison_memory() After recent soft-offline rework, error pages can be taken off from buddy allocator, but the existing unpoison_memory() does not properly undo the operation. Moreover, due to the recent change on __get_hwpoison_page(), get_page_unless_zero() is hardly called for hwpoisoned pages. So __get_hwpoison_page() highly likely returns -EBUSY (meaning to fail to grab page refcount) and unpoison just clears PG_hwpoison without releasing a refcount. That does not lead to a critical issue like kernel panic, but unpoisoned pages never get back to buddy (leaked permanently), which is not good. To (partially) fix this, we need to identify "taken off" pages from other types of hwpoisoned pages. We can't use refcount or page flags for this purpose, so a pseudo flag is defined by hacking ->private field. Someone might think that put_page() is enough to cancel taken-off pages, but the normal free path contains some operations not suitable for the current purpose, and can fire VM_BUG_ON(). Note that unpoison_memory() is now supposed to be cancel hwpoison events injected only by madvise() or /sys/devices/system/memory/{hard,soft}_offline_page, not by MCE injection, so please don't try to use unpoison when testing with MCE injection. [lkp@intel.com: report build failure for ARCH=i386] Link: https://lkml.kernel.org/r/20211115084006.3728254-4-naoya.horiguchi@linux.dev Signed-off-by: Naoya Horiguchi Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Ding Hui Cc: Tony Luck Cc: "Aneesh Kumar K.V" Cc: Miaohe Lin Cc: Peter Xu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + include/linux/page-flags.h | 4 ++ mm/memory-failure.c | 109 +++++++++++++++++++++++++++++++++++++-------- mm/page_alloc.c | 27 +++++++++++ 4 files changed, 122 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7f594da84aca..d4fb49a5d60d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3174,6 +3174,7 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 18423c2157e8..7e2b90dc7d3f 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -522,7 +522,11 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +#define MAGIC_HWPOISON 0x48575053U /* HWPS */ +extern void SetPageHWPoisonTakenOff(struct page *page); +extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); +extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 810328fe8adb..6a2b4b86b679 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1160,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function @@ -1262,6 +1278,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1278,18 +1315,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1937,6 +1982,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1953,9 +2020,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - int ret = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1991,24 +2056,30 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); + if (PageSlab(page) || PageTable(page)) goto unlock_mutex; - } - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p); - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } unlock_mutex: mutex_unlock(&mf_mutex); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 99fc65c532f0..d4205e5e41d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -9508,6 +9509,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9519,6 +9521,31 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif #ifdef CONFIG_ZONE_DMA -- cgit v1.2.3 From 8c57c07741bf28e7d867f1200aa80120b8ca663e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 14 Jan 2022 14:09:12 -0800 Subject: mm: memcg/percpu: account extra objcg space to memory cgroups Similar to slab memory allocator, for each accounted percpu object there is an extra space which is used to store obj_cgroup membership. Charge it too. [akpm@linux-foundation.org: fix layout] Link: https://lkml.kernel.org/r/20211126040606.97836-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu-internal.h | 18 ++++++++++++++++++ mm/percpu.c | 10 +++++----- 2 files changed, 23 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 639662c20c82..411d1593ef23 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. + */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size; + + extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + + return size * num_possible_cpus() + extra_size; +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_PERCPU_STATS #include diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..4199a0604c32 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { obj_cgroup_put(objcg); return false; } @@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - size * num_possible_cpus()); + pcpu_obj_full_size(size)); rcu_read_unlock(); } else { - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); obj_cgroup_put(objcg); } } @@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - -(size * num_possible_cpus())); + -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); -- cgit v1.2.3 From 5ee2fa2f063649570c702164f47a558a3432dd9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 14 Jan 2022 14:09:16 -0800 Subject: mm/rmap: fix potential batched TLB flush race In theory, the following race is possible for batched TLB flushing. CPU0 CPU1 ---- ---- shrink_page_list() unmap zap_pte_range() flush_tlb_batched_pending() flush_tlb_mm() try_to_unmap() set_tlb_ubc_flush_pending() mm->tlb_flush_batched = true mm->tlb_flush_batched = false After the TLB is flushed on CPU1 via flush_tlb_mm() and before mm->tlb_flush_batched is set to false, some PTE is unmapped on CPU0 and the TLB flushing is pended. Then the pended TLB flushing will be lost. Although both set_tlb_ubc_flush_pending() and flush_tlb_batched_pending() are called with PTL locked, different PTL instances may be used. Because the race window is really small, and the lost TLB flushing will cause problem only if a TLB entry is inserted before the unmapping in the race window, the race is only theoretical. But the fix is simple and cheap too. Syzbot has reported this too as follows: ================================================================== BUG: KCSAN: data-race in flush_tlb_batched_pending / try_to_unmap_one write to 0xffff8881072cfbbc of 1 bytes by task 17406 on cpu 1: flush_tlb_batched_pending+0x5f/0x80 mm/rmap.c:691 madvise_free_pte_range+0xee/0x7d0 mm/madvise.c:594 walk_pmd_range mm/pagewalk.c:128 [inline] walk_pud_range mm/pagewalk.c:205 [inline] walk_p4d_range mm/pagewalk.c:240 [inline] walk_pgd_range mm/pagewalk.c:277 [inline] __walk_page_range+0x981/0x1160 mm/pagewalk.c:379 walk_page_range+0x131/0x300 mm/pagewalk.c:475 madvise_free_single_vma mm/madvise.c:734 [inline] madvise_dontneed_free mm/madvise.c:822 [inline] madvise_vma mm/madvise.c:996 [inline] do_madvise+0xe4a/0x1140 mm/madvise.c:1202 __do_sys_madvise mm/madvise.c:1228 [inline] __se_sys_madvise mm/madvise.c:1226 [inline] __x64_sys_madvise+0x5d/0x70 mm/madvise.c:1226 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae write to 0xffff8881072cfbbc of 1 bytes by task 71 on cpu 0: set_tlb_ubc_flush_pending mm/rmap.c:636 [inline] try_to_unmap_one+0x60e/0x1220 mm/rmap.c:1515 rmap_walk_anon+0x2fb/0x470 mm/rmap.c:2301 try_to_unmap+0xec/0x110 shrink_page_list+0xe91/0x2620 mm/vmscan.c:1719 shrink_inactive_list+0x3fb/0x730 mm/vmscan.c:2394 shrink_list mm/vmscan.c:2621 [inline] shrink_lruvec+0x3c9/0x710 mm/vmscan.c:2940 shrink_node_memcgs+0x23e/0x410 mm/vmscan.c:3129 shrink_node+0x8f6/0x1190 mm/vmscan.c:3252 kswapd_shrink_node mm/vmscan.c:4022 [inline] balance_pgdat+0x702/0xd30 mm/vmscan.c:4213 kswapd+0x200/0x340 mm/vmscan.c:4473 kthread+0x2c7/0x2e0 kernel/kthread.c:327 ret_from_fork+0x1f/0x30 value changed: 0x01 -> 0x00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 71 Comm: kswapd0 Not tainted 5.16.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 ================================================================== [akpm@linux-foundation.org: tweak comments] Link: https://lkml.kernel.org/r/20211201021104.126469-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reported-by: syzbot+aa5bebed695edaccf0df@syzkaller.appspotmail.com Cc: Nadav Amit Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Andy Lutomirski Cc: Dave Hansen Cc: Will Deacon Cc: Yu Zhao Cc: Marco Elver Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- mm/rmap.c | 43 ++++++++++++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6a89f128c990..e3b0476a4fda 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT diff --git a/mm/rmap.c b/mm/rmap.c index 163ac4e6bcee..6a1e8c7f6213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. + */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; @@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) * before the PTE is cleared. */ barrier(); - mm->tlb_flush_batched = true; + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } /* * If the PTE was dirty then it's best to assume it's writable. The @@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (data_race(mm->tlb_flush_batched)) { - flush_tlb_mm(mm); + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + if (pending != flushed) { + flush_tlb_mm(mm); /* - * Do not allow the compiler to re-order the clearing of - * tlb_flush_batched before the tlb is flushed. + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing. */ - barrier(); - mm->tlb_flush_batched = false; + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else -- cgit v1.2.3 From f44e1e697674335f837280d5e4485d1523206ea9 Mon Sep 17 00:00:00 2001 From: Zhaoyu Liu Date: Fri, 14 Jan 2022 14:09:19 -0800 Subject: zpool: remove the list of pools_head The list of pools_head is no longer needed because the caller has been deleted in commit 479305fd7172 ("zpool: remove zpool_evict()"). Link: https://lkml.kernel.org/r/20211215163727.GA17196@pc Signed-off-by: Zhaoyu Liu Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zpool.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'mm') diff --git a/mm/zpool.c b/mm/zpool.c index 6d9ed48141e5..68facc193496 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -24,16 +24,11 @@ struct zpool { const struct zpool_ops *ops; bool evictable; bool can_sleep_mapped; - - struct list_head list; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); -static LIST_HEAD(pools_head); -static DEFINE_SPINLOCK(pools_lock); - /** * zpool_register_driver() - register a zpool implementation. * @driver: driver to register @@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, pr_debug("created pool type %s\n", type); - spin_lock(&pools_lock); - list_add(&zpool->list, &pools_head); - spin_unlock(&pools_lock); - return zpool; } @@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); - spin_lock(&pools_lock); - list_del(&zpool->list); - spin_unlock(&pools_lock); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); -- cgit v1.2.3 From 0b8f0d870020dbd7037bfacbb73a9b3213470f90 Mon Sep 17 00:00:00 2001 From: Quanfa Fu Date: Fri, 14 Jan 2022 14:09:25 -0800 Subject: mm: fix some comment errors Link: https://lkml.kernel.org/r/20211101040208.460810-1-fuqf0919@gmail.com Signed-off-by: Quanfa Fu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 2 +- mm/memory-failure.c | 2 +- mm/slab_common.c | 2 +- mm/swap.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 02071f213c58..7af84bac6fc2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1303,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a2b4b86b679..373837bb94cb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1306,7 +1306,7 @@ static int __get_unpoison_page(struct page *page) * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, diff --git a/mm/slab_common.c b/mm/slab_common.c index 1f75bd4e95d6..9513244457e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -819,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void) if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) diff --git a/mm/swap.c b/mm/swap.c index e8c9dc6d0377..b461814ce0cb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -882,7 +882,7 @@ void lru_cache_disable(void) * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); -- cgit v1.2.3 From cab0a7c115546a4865fb7439558af9077a569574 Mon Sep 17 00:00:00 2001 From: Ting Liu Date: Fri, 14 Jan 2022 14:09:28 -0800 Subject: mm: make some vars and functions static or __init "page_idle_ops" as a global var, but its scope of use within this document. So it should be static. "page_ext_ops" is a var used in the kernel initial phase. And other functions are aslo used in the kernel initial phase. So they should be __init or __initdata to reclaim memory. Link: https://lkml.kernel.org/r/20211217095023.67293-1-liuting.0x7c00@bytedance.com Signed-off-by: Ting Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_idle.h | 1 - mm/page_ext.c | 4 ++-- mm/page_owner.c | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 83abf95e9fa7..4663dfed1293 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,7 +13,6 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ -extern struct page_ext_operations page_idle_ops; static inline bool folio_test_young(struct folio *folio) { diff --git a/mm/page_ext.c b/mm/page_ext.c index bee3240604dc..2e66d934d63f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -64,12 +64,12 @@ static bool need_page_idle(void) { return true; } -struct page_ext_operations page_idle_ops = { +static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, }; #endif -static struct page_ext_operations *page_ext_ops[] = { +static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f924957ce7a..5eea061bb1e5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf) } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -75,7 +75,7 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; -- cgit v1.2.3 From 87c01d57fa23de82fff593a7d070933d08755801 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Fri, 14 Jan 2022 14:09:31 -0800 Subject: mm/hmm.c: allow VM_MIXEDMAP to work with hmm_range_fault hmm_range_fault() can be used instead of get_user_pages() for devices which allow faulting however unlike get_user_pages() it will return an error when used on a VM_MIXEDMAP range. To make hmm_range_fault() more closely match get_user_pages() remove this restriction. This requires dealing with the !ARCH_HAS_PTE_SPECIAL case in hmm_vma_handle_pte(). Rather than replicating the logic of vm_normal_page() call it directly and do a check for the zero pfn similar to what get_user_pages() currently does. Also add a test to hmm selftest to verify functionality. Link: https://lkml.kernel.org/r/20211104012001.2555676-1-apopple@nvidia.com Fixes: da4c3c735ea4 ("mm/hmm/mirror: helper to snapshot CPU page table") Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Jerome Glisse Cc: John Hubbard Cc: Zi Yan Cc: Ralph Campbell Cc: Felix Kuehling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hmm.c | 24 +++++++++++++++++++ mm/hmm.c | 5 ++-- tools/testing/selftests/vm/hmm-tests.c | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e2ce8f9b7605..767538089a62 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, return 0; } +static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long addr; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page *page; + int ret; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + + ret = vm_insert_page(vma, addr, page); + if (ret) { + __free_page(page); + return ret; + } + put_page(page); + } + + return 0; +} + static const struct file_operations dmirror_fops = { .open = dmirror_fops_open, .release = dmirror_fops_release, + .mmap = dmirror_fops_mmap, .unlocked_ioctl = dmirror_fops_unlocked_ioctl, .llseek = default_llseek, .owner = THIS_MODULE, diff --git a/mm/hmm.c b/mm/hmm.c index 842e26599238..bd56641c79d4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !pte_devmap(pte) && + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && vma->vm_flags & VM_READ) return 0; diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 864f126ffd78..203323967b50 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown) } } +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + /* * Test memory snapshot without faulting in pages accessed by the device. */ -- cgit v1.2.3 From b627b774911660852ce7f3f3817955ddad2bd130 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:34 -0800 Subject: mm/damon: unified access_check function naming rules Patch series "mm/damon: Do some small changes", v4. This patch (of 4): In damon/paddr.c file, two functions names start with underscore, static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, struct damon_region *r) In damon/vaddr.c file, there are also two functions with the same function, static void damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) static void damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) It makes sense to keep consistent, and it is not easy to be confused with the function that call them. Link: https://lkml.kernel.org/r/cover.1636989871.git.xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/529054aed932a42b9c09fc9977ad4574b9e7b0bd.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb1..73c5d1aafda6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -410,7 +410,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -429,7 +429,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -515,7 +515,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -551,7 +551,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); -- cgit v1.2.3 From d720bbbd70e968f8a0257393b575c3a29b56f990 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:40 -0800 Subject: mm/damon/core: use abs() instead of diff_of() In kernel, we can use abs(a - b) to get the absolute value, So there is no need to redefine a new one. Link: https://lkml.kernel.org/r/b24e7b82d9efa90daf150d62dea171e19390ad0b.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Muchun Song Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index e92497895202..04b8df7fd9e9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -750,8 +750,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +763,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else -- cgit v1.2.3 From cdeed009f3bceee41f73f0137db785fd29a05cb8 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:44 -0800 Subject: mm/damon: remove some unneeded function definitions in damon.h In damon.h some func definitions about VA & PA can only be used in its own file, so there no need to define in the header file, and the header file will look cleaner. If other files later need these functions, the prototypes can be added to damon.h at that time. [sj@kernel.org: remove unnecessary function prototype position changes] Link: https://lkml.kernel.org/r/20211118114827.20052-1-sj@kernel.org Link: https://lkml.kernel.org/r/45fd5b3ef6cce8e28dbc1c92f9dc845ccfc949d7.1636989871.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 21 --------------------- mm/damon/paddr.c | 11 ++++++----- mm/damon/vaddr.c | 18 ++++++++++-------- 3 files changed, 16 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987..1d1be348f506 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -461,34 +461,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d6..4318134cbc4c 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -213,7 +213,7 @@ bool damon_pa_target_valid(void *t) return true; } -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { unsigned long addr; @@ -246,8 +246,9 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return 0; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 73c5d1aafda6..a9d3b4d96e29 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -272,7 +272,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +292,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +357,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -418,7 +419,7 @@ static void __damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -539,7 +540,7 @@ static void __damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -603,7 +604,7 @@ out: } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, +static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, struct damon_region *r, struct damos *scheme) { int madv_action; @@ -633,8 +634,9 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { -- cgit v1.2.3 From 8bd0b9da03c9154e279b1a502636103887b9fbed Mon Sep 17 00:00:00 2001 From: Yihao Han Date: Fri, 14 Jan 2022 14:09:47 -0800 Subject: mm/damon/vaddr: remove swap_ranges() and replace it with swap() Remove 'swap_ranges()' and replace it with the macro 'swap()' defined in 'include/linux/minmax.h' to simplify code and improve efficiency Link: https://lkml.kernel.org/r/20211111115355.2808-1-hanyihao@vivo.com Signed-off-by: Yihao Han Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a9d3b4d96e29..78ff2bcb66eb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -98,16 +98,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -146,9 +136,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +149,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); -- cgit v1.2.3 From c89ae63eb0662b6c9f82dbfad3ef010239b8c1b1 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:50 -0800 Subject: mm/damon/schemes: add the validity judgment of thresholds In dbgfs "schemes" interface, i do some test like this: # cd /sys/kernel/debug/damon # echo "2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3" > schemes # cat schemes # 2 1 2 1 10 1 3 10 1 1 1 1 1 1 1 1 2 3 0 0 There have some unreasonable places, i set the valules of these variables " , , " as "<2, 1>, <2, 1>, <10, 1>, <1, 2, 3>. So there add a validity judgment for these thresholds value. Link: https://lkml.kernel.org/r/d78360e52158d786fcbf20bc62c96785742e76d3.1637239568.git.xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index ad65436756af..bf36a2756cfb 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -213,6 +213,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); -- cgit v1.2.3 From 9b2a38d6ef25c1748e3964b0ff30a89e4ed26583 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 14 Jan 2022 14:09:53 -0800 Subject: mm/damon: move damon_rand() definition into damon.h damon_rand() is called in three files:damon/core.c, damon/ paddr.c, damon/vaddr.c, i think there is no need to redefine this twice, So move it to damon.h will be a good choice. Link: https://lkml.kernel.org/r/20211202075859.51341-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 4 ++++ mm/damon/core.c | 4 ---- mm/damon/prmtv-common.h | 4 ---- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index 1d1be348f506..3e91a597a1aa 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,16 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +#define damon_rand(l, r) (l + prandom_u32_max(r - l)) + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). diff --git a/mm/damon/core.c b/mm/damon/core.c index 04b8df7fd9e9..61e844d15b13 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -23,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 61f27037603e..e790cb5f8fe0 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -6,10 +6,6 @@ */ #include -#include - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); -- cgit v1.2.3 From 88f86dcfa454784f7de550966c60fc78a3e95d6d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:09:59 -0800 Subject: mm/damon: convert macro functions to static inline functions Patch series "mm/damon: Misc cleanups". This patchset contains miscellaneous cleanups for DAMON's macro functions and documentation. This patch (of 6): This commit converts macro functions in DAMON to static inline functions, for better type checking, code documentation, etc[1]. [1] https://lore.kernel.org/linux-mm/20211202151213.6ec830863342220da4141bc5@linux-foundation.org/ Link: https://lkml.kernel.org/r/20211209131806.19317-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211209131806.19317-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 18 ++++++++++++------ mm/damon/core.c | 5 ++++- mm/damon/vaddr.c | 6 ++++-- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index e2c8152985b7..2dbc1f545da2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -399,14 +399,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 61e844d15b13..4515cf82c433 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -729,7 +729,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 78ff2bcb66eb..68d9e4134816 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -26,8 +26,10 @@ * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target -- cgit v1.2.3 From 0e92c2ee9f459542c5384d9cfab24873c3dd6398 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:17 -0800 Subject: mm/damon/schemes: account scheme actions that successfully applied Patch series "mm/damon/schemes: Extend stats for better online analysis and tuning". To help online access pattern analysis and tuning of DAMON-based Operation Schemes (DAMOS), DAMOS provides simple statistics for each scheme. Introduction of DAMOS time/space quota further made the tuning easier by making the risk management easier. However, that also made understanding of the working schemes a little bit more difficult. For an example, progress of a given scheme can now be throttled by not only the aggressiveness of the target access pattern, but also the time/space quotas. So, when a scheme is showing unexpectedly slow progress, it's difficult to know by what the progress of the scheme is throttled, with currently provided statistics. This patchset extends the statistics to contain some metrics that can be helpful for such online schemes analysis and tuning (patches 1-2), exports those to users (patches 3 and 5), and add documents (patches 4 and 6). This patch (of 6): DAMON-based operation schemes (DAMOS) stats provide only the number and the amount of regions that the action of the scheme has tried to be applied. Because the action could be failed for some reasons, the currently provided information is sometimes not useful or convenient enough for schemes profiling and tuning. To improve this situation, this commit extends the DAMOS stats to provide the number and the amount of regions that the action has successfully applied. Link: https://lkml.kernel.org/r/20211210150016.35349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211210150016.35349-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 28 +++++++++++++++++++++------- mm/damon/core.c | 13 ++++++++----- mm/damon/dbgfs.c | 2 +- mm/damon/paddr.c | 13 +++++++------ mm/damon/vaddr.c | 30 ++++++++++++++++-------------- 5 files changed, 53 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index 97f4a224e950..e0ad3d9aaeed 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -192,6 +192,20 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -203,8 +217,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the @@ -235,8 +248,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -281,7 +293,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -295,8 +308,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 4515cf82c433..d745bf28509f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -102,8 +102,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -574,6 +573,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -627,7 +627,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->primitive.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -641,8 +641,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index bf36a2756cfb..9318b52d0b46 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -117,7 +117,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried); if (!rc) return -ENOMEM; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 4318134cbc4c..5e8244f65a1a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t) return true; } -static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,9 +242,9 @@ static int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } static int damon_pa_scheme_score(struct damon_ctx *context, diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 68d9e4134816..a10df3fd3d02 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -572,32 +572,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -620,7 +622,7 @@ static int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); -- cgit v1.2.3 From 6268eac34ca30af7f6313504d556ec7fcd295621 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:20 -0800 Subject: mm/damon/schemes: account how many times quota limit has exceeded If the time/space quotas of a given DAMON-based operation scheme is too small, the scheme could show unexpectedly slow progress. However, there is no good way to notice the case in runtime. This commit extends the DAMOS stat to provide how many times the quota limits exceeded so that the users can easily notice the case and tune the scheme. Link: https://lkml.kernel.org/r/20211210150016.35349-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 2 ++ mm/damon/core.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index e0ad3d9aaeed..af648388e759 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -198,12 +198,14 @@ struct damos_watermarks { * @sz_tried: Total size of regions that the scheme is tried to be applied. * @nr_applied: Total number of regions that the scheme is applied. * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. */ struct damos_stat { unsigned long nr_tried; unsigned long sz_tried; unsigned long nr_applied; unsigned long sz_applied; + unsigned long qt_exceeds; }; /** diff --git a/mm/damon/core.c b/mm/damon/core.c index d745bf28509f..d5120b326e1b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -693,6 +693,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; -- cgit v1.2.3 From 60e52e7c46a127bca5ddd48b89002564f3862063 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:23 -0800 Subject: mm/damon/reclaim: provide reclamation statistics This implements new DAMON_RECLAIM parameters for statistics reporting. Those can be used for understanding how DAMON_RECLAIM is working, and for tuning the other parameters. Link: https://lkml.kernel.org/r/20211210150016.35349-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/reclaim.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'mm') diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf..bc476cef688e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. + */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); @@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void) return -ENOMEM; damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; /* 4242 means nothing but fun */ target = damon_new_target(4242); -- cgit v1.2.3 From 3a619fdb8de8a3ecd4200e7d183d2c8ceb32289e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:29 -0800 Subject: mm/damon/dbgfs: support all DAMOS stats Currently, DAMON debugfs interface is not supporting DAMON-based Operation Schemes (DAMOS) stats for schemes successfully applied regions and time/space quota limit exceeds. This adds the support. Link: https://lkml.kernel.org/r/20211210150016.35349-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 9318b52d0b46..751c7b835684 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat.nr_tried, s->stat.sz_tried); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; -- cgit v1.2.3 From 49f4203aae06ba9d67b500c90339b262b0a52637 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 14 Jan 2022 14:10:35 -0800 Subject: mm/damon: add access checking for hugetlb pages The process's VMAs can be mapped by hugetlb page, but now the DAMON did not implement the access checking for hugetlb pte, so we can not get the actual access count like below if a process VMAs were mapped by hugetlb. damon_aggregated: target_id=18446614368406014464 nr_regions=12 4194304-5476352: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662370467840-140662372970496: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662372970496-140662375460864: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662375460864-140662377951232: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662377951232-140662380449792: 0 545 damon_aggregated: target_id=18446614368406014464 nr_regions=12 140662380449792-140662382944256: 0 545 ...... Thus this patch adds hugetlb access checking support, with this patch we can see below VMA mapped by hugetlb access count. damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296486649856-140296489914368: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296489914368-140296492978176: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296492978176-140296495439872: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296495439872-140296498311168: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296498311168-140296501198848: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296501198848-140296504320000: 1 3 damon_aggregated: target_id=18446613056935405824 nr_regions=12 140296504320000-140296507568128: 1 2 ...... [baolin.wang@linux.alibaba.com: fix unused var warning] Link: https://lkml.kernel.org/r/1aaf9c11-0d8e-b92d-5c92-46e50a6e8d4e@linux.alibaba.com [baolin.wang@linux.alibaba.com: v3] Link: https://lkml.kernel.org/r/486927ecaaaecf2e3a7fbe0378ec6e1c58b50747.1640852276.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/6afcbd1fda5f9c7c24f320d26a98188c727ceec3.1639623751.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Cc: Mike Kravetz Cc: Randy Dunlap Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index a10df3fd3d02..ee465b380612 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -388,8 +388,65 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -484,8 +541,47 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.3 From 2cd4b8e10cc31eadb5b10b1d73b3f28156f3776c Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 14 Jan 2022 14:10:38 -0800 Subject: mm/damon: move the implementation of damon_insert_region to damon.h Usually, inline function is declared static since it should sit between storage and type. And implement it in a header file if used by multiple files. And this change also fixes compile issue when backport damon to 5.10. mm/damon/vaddr.c: In function `damon_va_evenly_split_region': ./include/linux/damon.h:425:13: error: inlining failed in call to `always_inline' `damon_insert_region': function body not available 425 | inline void damon_insert_region(struct damon_region *r, | ^~~~~~~~~~~~~~~~~~~ mm/damon/vaddr.c:86:3: note: called from here 86 | damon_insert_region(n, r, next, t); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lkml.kernel.org/r/20211223085703.6142-1-guoqing.jiang@linux.dev Signed-off-by: Guoqing Jiang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/damon.h | 13 +++++++++++-- mm/damon/core.c | 11 ----------- 2 files changed, 11 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index af648388e759..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -451,9 +451,18 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); diff --git a/mm/damon/core.c b/mm/damon/core.c index d5120b326e1b..6482d510dcbe 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -49,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); -- cgit v1.2.3 From 70b8480812d0a3930049a44820a1fa149b090c10 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:41 -0800 Subject: mm/damon/dbgfs: remove an unnecessary variable Patch series "mm/damon: Hide unnecessary information disclosures". DAMON is exposing some unnecessary information including kernel pointer in kernel log and tracepoint. This patchset hides such information. The first patch is only for a trivial cleanup, though. This patch (of 4): This commit removes a unnecessarily used variable in dbgfs_target_ids_write(). Link: https://lkml.kernel.org/r/20211229131016.23641-1-sj@kernel.org Link: https://lkml.kernel.org/r/20211229131016.23641-2-sj@kernel.org Fixes: 4bc05954d007 ("mm/damon: implement a debugfs-based user space interface") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/dbgfs.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 751c7b835684..5b899601e56c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -364,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret; @@ -374,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; /* target id is meaningless here, but we set it just for fun */ scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, count, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From 251403f19aab6a122f4dcfb14149814e85564202 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:44 -0800 Subject: mm/damon/vaddr: use pr_debug() for damon_va_three_regions() failure logging Failure of 'damon_va_three_regions()' is logged using 'pr_err()'. But, the function can fail in legal situations. To avoid making users be surprised and to keep the kernel clean, this makes the log to be printed using 'pr_debug()'. Link: https://lkml.kernel.org/r/20211229131016.23641-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ee465b380612..223829655d64 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -238,7 +238,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, int i; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + pr_debug("Failed to get three regions of target %lu\n", t->id); return; } -- cgit v1.2.3 From 962fe7a6b1b2f9deb1b31b3344afa3b11afdf7ab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:47 -0800 Subject: mm/damon/vaddr: hide kernel pointer from damon_va_three_regions() failure log The failure log message for 'damon_va_three_regions()' prints the target id, which is a 'struct pid' pointer in the case. To avoid exposing the kernel pointer via the log, this makes the log to use the index of the target in the context's targets list instead. Link: https://lkml.kernel.org/r/20211229131016.23641-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/damon/vaddr.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 223829655d64..89b6468da2b9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -232,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_debug("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } -- cgit v1.2.3 From 76fd0285b447991267e838842c0be7395eb454bb Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 14 Jan 2022 14:10:50 -0800 Subject: mm/damon: hide kernel pointer from tracepoint event DAMON's virtual address spaces monitoring primitive uses 'struct pid *' of the target process as its monitoring target id. The kernel address is exposed as-is to the user space via the DAMON tracepoint, 'damon_aggregated'. Though primarily only privileged users are allowed to access that, it would be better to avoid unnecessarily exposing kernel pointers so. Because the trace result is only required to be able to distinguish each target, we aren't need to use the pointer as-is. This makes the tracepoint to use the index of the target in the context's targets list as its id in the tracepoint, to hide the kernel space address. Link: https://lkml.kernel.org/r/20211229131016.23641-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/damon.h | 8 ++++---- mm/damon/core.c | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 99ffa601e351..c79f1d4c39af 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, struct damon_region *r, - unsigned int nr_regions), + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), - TP_ARGS(t, r, nr_regions), + TP_ARGS(t, target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) @@ -26,7 +26,7 @@ TRACE_EVENT(damon_aggregated, ), TP_fast_assign( - __entry->target_id = t->id; + __entry->target_id = target_id; __entry->nr_regions = nr_regions; __entry->start = r->ar.start; __entry->end = r->ar.end; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6482d510dcbe..1dd153c31c9e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -514,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } -- cgit v1.2.3