From 46a9ea6681907a3be6b6b0d43776dccc62cad6cf Mon Sep 17 00:00:00 2001
From: Rafael Aquini
Date: Fri, 8 Sep 2023 19:06:49 -0400
Subject: mm/slab_common: fix slab_caches list corruption after kmem_cache_destroy()

After the commit in Fixes:, if a module that created a slab cache does not
release all of its allocated objects before destroying the cache (at rmmod
time), we might end up releasing the kmem_cache object without removing it
from the slab_caches list, thus corrupting the list: kmem_cache_destroy()
ignores the return value from shutdown_cache(), which in turn never removes
the kmem_cache object from slabs_list when __kmem_cache_shutdown() fails to
release all of the cache's slabs.

This is easily observable on a kernel built with CONFIG_DEBUG_LIST=y: after
that improper release, the system will immediately trip on list_add or
list_del assertions similar to the one shown below as soon as another
kmem_cache gets created or destroyed:

[ 1041.213632] list_del corruption. next->prev should be ffff89f596fb5768, but was 52f1e5016aeee75d. (next=ffff89f595a1b268)
[ 1041.219165] ------------[ cut here ]------------
[ 1041.221517] kernel BUG at lib/list_debug.c:62!
[ 1041.223452] invalid opcode: 0000 [#1] PREEMPT SMP PTI
[ 1041.225408] CPU: 2 PID: 1852 Comm: rmmod Kdump: loaded Tainted: G B W OE 6.5.0 #15
[ 1041.228244] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230524-3.fc37 05/24/2023
[ 1041.231212] RIP: 0010:__list_del_entry_valid+0xae/0xb0

Another quick way to trigger this issue, in a kernel with CONFIG_SLUB=y, is
to set slub_debug to poison the released objects and then just run
cat /proc/slabinfo after removing the module that leaks slab objects, in
which case the kernel will panic:

[ 50.954843] general protection fault, probably for non-canonical address 0xa56b6b6b6b6b6b8b: 0000 [#1] PREEMPT SMP PTI
[ 50.961545] CPU: 2 PID: 1495 Comm: cat Kdump: loaded Tainted: G B W OE 6.5.0 #15
[ 50.966808] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230524-3.fc37 05/24/2023
[ 50.972663] RIP: 0010:get_slabinfo+0x42/0xf0

This patch fixes this issue by properly checking shutdown_cache()'s return
value before taking the kmem_cache_release() branch.
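To make the trigger concrete, a minimal, hypothetical module along the lines
described above (not part of this patch) would create a cache, leak one
object, and destroy the cache at rmmod time:

/*
 * Hypothetical reproducer sketch, not from the patch: the cache still
 * has one live object when kmem_cache_destroy() runs, so
 * __kmem_cache_shutdown() fails; before this fix the kmem_cache was
 * freed anyway while still linked into slab_caches.
 */
#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *leaky_cache;
static void *leaked_obj;

static int __init leaky_init(void)
{
	leaky_cache = kmem_cache_create("leaky_cache", 64, 0, 0, NULL);
	if (!leaky_cache)
		return -ENOMEM;

	/* Allocate an object that is intentionally never freed. */
	leaked_obj = kmem_cache_alloc(leaky_cache, GFP_KERNEL);
	if (!leaked_obj) {
		kmem_cache_destroy(leaky_cache);
		return -ENOMEM;
	}
	return 0;
}

static void __exit leaky_exit(void)
{
	/* The leaked object is still outstanding here. */
	kmem_cache_destroy(leaky_cache);
}

module_init(leaky_init);
module_exit(leaky_exit);
MODULE_LICENSE("GPL");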
Fixes: 0495e337b703 ("mm/slab_common: Deleting kobject in kmem_cache_destroy() without holding slab_mutex/cpu_hotplug_lock") Signed-off-by: Rafael Aquini Cc: stable@vger.kernel.org Reviewed-by: Waiman Long Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 01cdbf122463..e99e821065c3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -479,7 +479,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { - int refcnt; + int err = -EBUSY; bool rcu_set; if (unlikely(!s) || !kasan_check_byte(s)) @@ -490,17 +490,17 @@ void kmem_cache_destroy(struct kmem_cache *s) rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; - refcnt = --s->refcount; - if (refcnt) + s->refcount--; + if (s->refcount) goto out_unlock; - WARN(shutdown_cache(s), - "%s %s: Slab cache still has objects when called from %pS", + err = shutdown_cache(s); + WARN(err, "%s %s: Slab cache still has objects when called from %pS", __func__, s->name, (void *)_RET_IP_); out_unlock: mutex_unlock(&slab_mutex); cpus_read_unlock(); - if (!refcnt && !rcu_set) + if (!err && !rcu_set) kmem_cache_release(s); } EXPORT_SYMBOL(kmem_cache_destroy); -- cgit v1.2.3 From 3cec50490969afd4a76ccee441f747d869ccff77 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Sep 2023 12:31:42 -0700 Subject: vm: fix move_vma() memory accounting being off Commit 408579cd627a ("mm: Update do_vmi_align_munmap() return semantics") seems to have updated one of the callers of do_vmi_munmap() incorrectly: it used to check for the error case (which didn't change: negative means error). That commit changed the check to the success case (which did change: before that commit, 0 was success, and 1 was "success and lock downgraded". After the change, it's always 0 for success, and the lock will have been released if requested). This didn't change any actual VM behavior _except_ for memory accounting when 'VM_ACCOUNT' was set on the vma. Which made the wrong return value test fairly subtle, since everything continues to work. Or rather - it continues to work but the "Committed memory" accounting goes all wonky (Committed_AS value in /proc/meminfo), and depending on settings that then causes problems much much later as the VM relies on bogus statistics for its heuristics. Revert that one line of the change back to the original logic. Fixes: 408579cd627a ("mm: Update do_vmi_align_munmap() return semantics") Reported-by: Christoph Biedl Reported-bisected-and-tested-by: Michael Labiuk Cc: Bagas Sanjaya Cc: Liam R. 
Howlett Link: https://lore.kernel.org/all/1694366957@msgid.manchmal.in-ulm.de/ Signed-off-by: Linus Torvalds --- mm/mremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 056478c106ee..382e81c33fc4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -715,7 +715,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } vma_iter_init(&vmi, mm, old_addr); - if (!do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false)) { + if (do_vmi_munmap(&vmi, mm, old_addr, old_len, uf_unmap, false) < 0) { /* OOM: unable to split vma, just get accounts right */ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) vm_acct_memory(old_len >> PAGE_SHIFT); -- cgit v1.2.3 From 7b086755fb8cdbb6b3e45a1bbddc00e7f9b1dc03 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 11 Sep 2023 14:11:08 -0400 Subject: mm: page_alloc: fix CMA and HIGHATOMIC landing on the wrong buddy list Commit 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") bypasses the pcplist on lock contention and returns the page directly to the buddy list of the page's migratetype. For pages that don't have their own pcplist, such as CMA and HIGHATOMIC, the migratetype is temporarily updated such that the page can hitch a ride on the MOVABLE pcplist. Their true type is later reassessed when flushing in free_pcppages_bulk(). However, when lock contention is detected after the type was already overridden, the bypass will then put the page on the wrong buddy list. Once on the MOVABLE buddy list, the page becomes eligible for fallbacks and even stealing. In the case of HIGHATOMIC, otherwise ineligible allocations can dip into the highatomic reserves. In the case of CMA, the page can be lost from the CMA region permanently. Use a separate pcpmigratetype variable for the pcplist override. Use the original migratetype when going directly to the buddy. This fixes the bug and should make the intentions more obvious in the code. Originally sent here to address the HIGHATOMIC case: https://lore.kernel.org/lkml/20230821183733.106619-4-hannes@cmpxchg.org/ Changelog updated in response to the CMA-specific bug report. [mgorman@techsingularity.net: updated changelog] Link: https://lkml.kernel.org/r/20230911181108.GA104295@cmpxchg.org Fixes: 4b23a68f9536 ("mm/page_alloc: protect PCP lists with a spinlock") Signed-off-by: Johannes Weiner Reported-by: Joe Liu Reviewed-by: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0c5be12f9336..95546f376302 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2400,7 +2400,7 @@ void free_unref_page(struct page *page, unsigned int order) struct per_cpu_pages *pcp; struct zone *zone; unsigned long pfn = page_to_pfn(page); - int migratetype; + int migratetype, pcpmigratetype; if (!free_unref_page_prepare(page, pfn, order)) return; @@ -2408,24 +2408,24 @@ void free_unref_page(struct page *page, unsigned int order) /* * We only track unmovable, reclaimable and movable on pcp lists. * Place ISOLATE pages on the isolated list because they are being - * offlined but treat HIGHATOMIC as movable pages so we can get those - * areas back if necessary. Otherwise, we may have to free + * offlined but treat HIGHATOMIC and CMA as movable pages so we can + * get those areas back if necessary. 
Otherwise, we may have to free * excessively into the page allocator */ - migratetype = get_pcppage_migratetype(page); + migratetype = pcpmigratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE); return; } - migratetype = MIGRATE_MOVABLE; + pcpmigratetype = MIGRATE_MOVABLE; } zone = page_zone(page); pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { - free_unref_page_commit(zone, pcp, page, migratetype, order); + free_unref_page_commit(zone, pcp, page, pcpmigratetype, order); pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); -- cgit v1.2.3 From 9ea9cb00a82b53ec39630eac718776d37e41b35a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 14 Sep 2023 11:21:39 -0400 Subject: mm: memcontrol: fix GFP_NOFS recursion in memory.high enforcement Breno and Josef report a deadlock scenario from cgroup reclaim re-entering the filesystem: [ 361.546690] ====================================================== [ 361.559210] WARNING: possible circular locking dependency detected [ 361.571703] 6.5.0-0_fbk700_debug_rc0_kbuilder_13159_gbf787a128001 #1 Tainted: G S E [ 361.589704] ------------------------------------------------------ [ 361.602277] find/9315 is trying to acquire lock: [ 361.611625] ffff88837ba140c0 (&delayed_node->mutex){+.+.}-{4:4}, at: __btrfs_release_delayed_node+0x68/0x4f0 [ 361.631437] [ 361.631437] but task is already holding lock: [ 361.643243] ffff8881765b8678 (btrfs-tree-01){++++}-{4:4}, at: btrfs_tree_read_lock+0x1e/0x40 [ 362.904457] mutex_lock_nested+0x1c/0x30 [ 362.912414] __btrfs_release_delayed_node+0x68/0x4f0 [ 362.922460] btrfs_evict_inode+0x301/0x770 [ 362.982726] evict+0x17c/0x380 [ 362.988944] prune_icache_sb+0x100/0x1d0 [ 363.005559] super_cache_scan+0x1f8/0x260 [ 363.013695] do_shrink_slab+0x2a2/0x540 [ 363.021489] shrink_slab_memcg+0x237/0x3d0 [ 363.050606] shrink_slab+0xa7/0x240 [ 363.083382] shrink_node_memcgs+0x262/0x3b0 [ 363.091870] shrink_node+0x1a4/0x720 [ 363.099150] shrink_zones+0x1f6/0x5d0 [ 363.148798] do_try_to_free_pages+0x19b/0x5e0 [ 363.157633] try_to_free_mem_cgroup_pages+0x266/0x370 [ 363.190575] reclaim_high+0x16f/0x1f0 [ 363.208409] mem_cgroup_handle_over_high+0x10b/0x270 [ 363.246678] try_charge_memcg+0xaf2/0xc70 [ 363.304151] charge_memcg+0xf0/0x350 [ 363.320070] __mem_cgroup_charge+0x28/0x40 [ 363.328371] __filemap_add_folio+0x870/0xd50 [ 363.371303] filemap_add_folio+0xdd/0x310 [ 363.399696] __filemap_get_folio+0x2fc/0x7d0 [ 363.419086] pagecache_get_page+0xe/0x30 [ 363.427048] alloc_extent_buffer+0x1cd/0x6a0 [ 363.435704] read_tree_block+0x43/0xc0 [ 363.443316] read_block_for_search+0x361/0x510 [ 363.466690] btrfs_search_slot+0xc8c/0x1520 This is caused by the mem_cgroup_handle_over_high() not respecting the gfp_mask of the allocation context. We used to only call this function on resume to userspace, where no locks were held. But c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") added a call from the allocation context without considering the gfp. 
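For background, a hedged sketch of the convention at stake (not taken from
this patch): filesystems keep reclaim from re-entering filesystem code
either by passing GFP_NOFS explicitly or by opening a scoped NOFS section,
and the fix here makes the memory.high reclaim path honor the gfp_mask that
the charging site actually used:

/*
 * Hypothetical illustration; the scoped API is from <linux/sched/mm.h>.
 * Allocations issued inside this scope are implicitly treated as
 * GFP_NOFS, so any direct reclaim they trigger stays out of
 * filesystem code.
 */
static void fs_operation_under_locks(void)	/* invented name */
{
	unsigned int nofs_flags = memalloc_nofs_save();

	/* ... allocate / charge memory while holding fs locks ... */

	memalloc_nofs_restore(nofs_flags);
}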
Link: https://lkml.kernel.org/r/20230914152139.100822-1-hannes@cmpxchg.org Fixes: c9afe31ec443 ("memcg: synchronously enforce memory.high for large overcharges") Signed-off-by: Johannes Weiner Reported-by: Breno Leitao Reported-by: Josef Bacik Acked-by: Shakeel Butt Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Muchun Song Cc: [5.17+] Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- include/linux/resume_user_mode.h | 2 +- mm/memcontrol.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ab94ad4597d0..e4e24da16d2c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -920,7 +920,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } -void mem_cgroup_handle_over_high(void); +void mem_cgroup_handle_over_high(gfp_t gfp_mask); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); @@ -1458,7 +1458,7 @@ static inline void mem_cgroup_unlock_pages(void) rcu_read_unlock(); } -static inline void mem_cgroup_handle_over_high(void) +static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask) { } diff --git a/include/linux/resume_user_mode.h b/include/linux/resume_user_mode.h index 285189454449..f8f3e958e9cf 100644 --- a/include/linux/resume_user_mode.h +++ b/include/linux/resume_user_mode.h @@ -55,7 +55,7 @@ static inline void resume_user_mode_work(struct pt_regs *regs) } #endif - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(GFP_KERNEL); blkcg_maybe_throttle_current(); rseq_handle_notify_resume(NULL, regs); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4d3282493b6..d13dde2f8b56 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2555,7 +2555,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, * Scheduled by try_charge() to be executed from the userland return path * and reclaims memory over the high limit. */ -void mem_cgroup_handle_over_high(void) +void mem_cgroup_handle_over_high(gfp_t gfp_mask) { unsigned long penalty_jiffies; unsigned long pflags; @@ -2583,7 +2583,7 @@ retry_reclaim: */ nr_reclaimed = reclaim_high(memcg, in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); + gfp_mask); /* * memory.high is breached and reclaim is unable to keep up. Throttle @@ -2819,7 +2819,7 @@ done_restock: if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH && !(current->flags & PF_MEMALLOC) && gfpflags_allow_blocking(gfp_mask)) { - mem_cgroup_handle_over_high(); + mem_cgroup_handle_over_high(gfp_mask); } return 0; } -- cgit v1.2.3 From c8be03806738c86521dbf1e0503bc90855fb99a3 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Thu, 14 Sep 2023 21:47:41 +0800 Subject: filemap: add filemap_map_order0_folio() to handle order0 folio Kernel test robot reported regressions for several benchmarks [1]. The regression are related with commit: de74976eb65151a2f568e477fc2e0032df5b22b4 ("filemap: add filemap_map_folio_range()") It turned out that function filemap_map_folio_range() brings these regressions when handle folio with order0. Add filemap_map_order0_folio() to handle order0 folio. 
The benefit come from two perspectives: - the code size is smaller (around 126 bytes) - no loop Testing showed the regressions reported by 0day [1] all are fixed: commit 9f1f5b60e76d44fa: parent commit of de74976eb65151a2 commit fbdf9263a3d7fdbd: latest mm-unstable commit commit 7fbfe2003f84686d: this fixing patch 9f1f5b60e76d44fa fbdf9263a3d7fdbd 7fbfe2003f84686d ---------------- --------------------------- --------------------------- 3843810 -21.4% 3020268 +4.6% 4018708 stress-ng.bad-altstack.ops 64061 -21.4% 50336 +4.6% 66977 stress-ng.bad-altstack.ops_per_sec 1709026 -14.4% 1462102 +2.4% 1750757 stress-ng.fork.ops 28483 -14.4% 24368 +2.4% 29179 stress-ng.fork.ops_per_sec 3685088 -53.6% 1710976 +0.5% 3702454 stress-ng.zombie.ops 56732 -65.3% 19667 +0.7% 57107 stress-ng.zombie.ops_per_sec 61874 -12.1% 54416 +0.4% 62136 vm-scalability.median 13527663 -11.7% 11942117 -0.1% 13513946 vm-scalability.throughput 4.066e+09 -11.7% 3.59e+09 -0.1% 4.061e+09 vm-scalability.workload [1]: https://lore.kernel.org/oe-lkp/72e017b9-deb6-44fa-91d6-716ee2c39cbc@intel.com/T/#m7d2bba30f75a9cee8eab07e5809abd9b3b206c84 Link: https://lkml.kernel.org/r/20230914134741.1937654-1-fengwei.yin@intel.com Fixes: de74976eb65151a2f568e477fc2e0032df5b22b4 ("filemap: add filemap_map_folio_range()") Signed-off-by: Yin Fengwei Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202309111556.b2aa3d7a-oliver.sang@intel.com Cc: Feng Tang Cc: Huang Ying Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 69 ++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 582f5317ff71..4ea4387053e8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3475,13 +3475,11 @@ skip: */ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct folio *folio, unsigned long start, - unsigned long addr, unsigned int nr_pages) + unsigned long addr, unsigned int nr_pages, + unsigned int *mmap_miss) { vm_fault_t ret = 0; - struct vm_area_struct *vma = vmf->vma; - struct file *file = vma->vm_file; struct page *page = folio_page(folio, start); - unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); unsigned int count = 0; pte_t *old_ptep = vmf->pte; @@ -3489,8 +3487,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, if (PageHWPoison(page + count)) goto skip; - if (mmap_miss > 0) - mmap_miss--; + (*mmap_miss)++; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3525,7 +3522,35 @@ skip: } vmf->pte = old_ptep; - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + + return ret; +} + +static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf, + struct folio *folio, unsigned long addr, + unsigned int *mmap_miss) +{ + vm_fault_t ret = 0; + struct page *page = &folio->page; + + if (PageHWPoison(page)) + return ret; + + (*mmap_miss)++; + + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit + * the fault-around logic. 
+ */ + if (!pte_none(ptep_get(vmf->pte))) + return ret; + + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + set_pte_range(vmf, folio, page, 1, addr); + folio_ref_inc(folio); return ret; } @@ -3541,7 +3566,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; vm_fault_t ret = 0; - int nr_pages = 0; + unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved; rcu_read_lock(); folio = next_uptodate_folio(&xas, mapping, end_pgoff); @@ -3569,25 +3594,27 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, end = folio->index + folio_nr_pages(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; - /* - * NOTE: If there're PTE markers, we'll leave them to be - * handled in the specific fault path, and it'll prohibit the - * fault-around logic. - */ - if (!pte_none(ptep_get(vmf->pte))) - goto unlock; - - ret |= filemap_map_folio_range(vmf, folio, - xas.xa_index - folio->index, addr, nr_pages); + if (!folio_test_large(folio)) + ret |= filemap_map_order0_folio(vmf, + folio, addr, &mmap_miss); + else + ret |= filemap_map_folio_range(vmf, folio, + xas.xa_index - folio->index, addr, + nr_pages, &mmap_miss); -unlock: folio_unlock(folio); folio_put(folio); - folio = next_uptodate_folio(&xas, mapping, end_pgoff); - } while (folio); + } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); + + mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss); + if (mmap_miss >= mmap_miss_saved) + WRITE_ONCE(file->f_ra.mmap_miss, 0); + else + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); -- cgit v1.2.3 From 2a86f1b56a30e242caf7ee1268af68f4f49ce847 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 20 Sep 2023 14:26:29 +0800 Subject: kasan: Cleanup the __HAVE_ARCH_SHADOW_MAP usage As Linus suggested, __HAVE_ARCH_XYZ is "stupid" and "having historical uses of it doesn't make it good". So migrate __HAVE_ARCH_SHADOW_MAP to separate macros named after the respective functions. 
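For readers unfamiliar with the idiom, a small hypothetical sketch (names
invented, not from this patch) of how per-function override macros replace
a single __HAVE_ARCH_XYZ switch: the arch defines a macro named after its
own inline, and the generic header supplies its fallback only when that
macro is absent:

/* "arch" header: provide the arch-specific version and announce it. */
#define widget_translate widget_translate
static inline unsigned long widget_translate(unsigned long addr)
{
	return addr ^ 0xffUL;	/* stand-in for arch-specific behaviour */
}

/* "generic" header: compiled only if the arch did not override it. */
#ifndef widget_translate
static inline unsigned long widget_translate(unsigned long addr)
{
	return addr >> 3;	/* stand-in for the common default */
}
#endif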
Suggested-by: Linus Torvalds Reviewed-by: WANG Xuerui Reviewed-by: Andrey Konovalov Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/kasan.h | 10 ++++++++-- include/linux/kasan.h | 2 +- mm/kasan/kasan.h | 8 +++----- 3 files changed, 12 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/arch/loongarch/include/asm/kasan.h b/arch/loongarch/include/asm/kasan.h index deeff8158f45..a12ecab37da7 100644 --- a/arch/loongarch/include/asm/kasan.h +++ b/arch/loongarch/include/asm/kasan.h @@ -10,8 +10,6 @@ #include #include -#define __HAVE_ARCH_SHADOW_MAP - #define KASAN_SHADOW_SCALE_SHIFT 3 #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) @@ -68,6 +66,7 @@ static __always_inline bool kasan_arch_is_ready(void) return !kasan_early_stage; } +#define kasan_mem_to_shadow kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { if (!kasan_arch_is_ready()) { @@ -97,6 +96,7 @@ static inline void *kasan_mem_to_shadow(const void *addr) } } +#define kasan_shadow_to_mem kasan_shadow_to_mem static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { unsigned long addr = (unsigned long)shadow_addr; @@ -119,6 +119,12 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) } } +#define addr_has_metadata addr_has_metadata +static __always_inline bool addr_has_metadata(const void *addr) +{ + return (kasan_mem_to_shadow((void *)addr) != NULL); +} + void kasan_init(void); asmlinkage void kasan_early_init(void); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 3df5499f7936..842623d708c2 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -54,7 +54,7 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); -#ifndef __HAVE_ARCH_SHADOW_MAP +#ifndef kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index f70e3d7a602e..d37831b8511c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,7 +291,7 @@ struct kasan_stack_ring { #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) -#ifndef __HAVE_ARCH_SHADOW_MAP +#ifndef kasan_shadow_to_mem static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) @@ -299,15 +299,13 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) } #endif +#ifndef addr_has_metadata static __always_inline bool addr_has_metadata(const void *addr) { -#ifdef __HAVE_ARCH_SHADOW_MAP - return (kasan_mem_to_shadow((void *)addr) != NULL); -#else return (kasan_reset_tag(addr) >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); -#endif } +#endif /** * kasan_check_range - Check memory region, and report if invalid access. -- cgit v1.2.3 From 8446a4deb6b6bc998f1d8d2a85d1a0c64b9e3a71 Mon Sep 17 00:00:00 2001 From: David Laight Date: Thu, 7 Sep 2023 12:42:20 +0000 Subject: slab: kmalloc_size_roundup() must not return 0 for non-zero size The typical use of kmalloc_size_roundup() is: ptr = kmalloc(sz = kmalloc_size_roundup(size), ...); if (!ptr) return -ENOMEM. This means it is vitally important that the returned value isn't less than the argument even if the argument is insane. In particular if kmalloc_slab() fails or the value is above (MAX_ULONG - PAGE_SIZE) zero is returned and kmalloc() will return its single zero-length buffer ZERO_SIZE_PTR. 
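To make the failure mode concrete, consider a hypothetical caller (not from
this patch, assumes <linux/slab.h> and <linux/string.h>); if
kmalloc_size_roundup() returned 0, kmalloc() would hand back ZERO_SIZE_PTR
rather than NULL, and the copy below would corrupt memory or fault:

/* Hypothetical caller illustrating why a 0 return is dangerous. */
static int copy_into_rounded_buffer(const u8 *src, size_t len, u8 **out)
{
	size_t alloc_size = kmalloc_size_roundup(len);
	u8 *buf = kmalloc(alloc_size, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;

	/* With alloc_size == 0, buf would be ZERO_SIZE_PTR, not NULL. */
	memcpy(buf, src, len);
	*out = buf;
	return 0;
}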
Fix this by returning the input size if the size exceeds KMALLOC_MAX_SIZE.
kmalloc() will then return NULL as the size really is too big.

kmalloc_slab() should not normally return NULL, unless called too early.
Again, returning zero is not the correct action, as in some usage scenarios
the result can be stored in a variable and only later cause kmalloc() to
return ZERO_SIZE_PTR and subsequent crashes on access.  Instead we can
simply stop checking the kmalloc_slab() result completely, as calling
kmalloc_size_roundup() too early would then result in an immediate crash
during boot and the developer noticing an issue in their code.

[vbabka@suse.cz: remove kmalloc_slab() result check, tweak comments and commit log]
Fixes: 05a940656e1e ("slab: Introduce kmalloc_size_roundup()")
Signed-off-by: David Laight
Signed-off-by: Vlastimil Babka
---
 mm/slab_common.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)
(limited to 'mm')

diff --git a/mm/slab_common.c b/mm/slab_common.c index e99e821065c3..306e6f0074ff 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -745,24 +745,24 @@ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) size_t kmalloc_size_roundup(size_t size) { - struct kmem_cache *c; + if (size && size <= KMALLOC_MAX_CACHE_SIZE) { + /* + * The flags don't matter since size_index is common to all. + * Neither does the caller for just getting ->object_size. + */ + return kmalloc_slab(size, GFP_KERNEL, 0)->object_size; + } - /* Short-circuit the 0 size case. */ - if (unlikely(size == 0)) - return 0; - /* Short-circuit saturated "too-large" case. */ - if (unlikely(size == SIZE_MAX)) - return SIZE_MAX; /* Above the smaller buckets, size is a multiple of page size. */ - if (size > KMALLOC_MAX_CACHE_SIZE) + if (size && size <= KMALLOC_MAX_SIZE) return PAGE_SIZE << get_order(size); /* - * The flags don't matter since size_index is common to all. - * Neither does the caller for just getting ->object_size. + * Return 'size' for 0 - kmalloc() returns ZERO_SIZE_PTR + * and very large size - kmalloc() may fail. */ - c = kmalloc_slab(size, GFP_KERNEL, 0); - return c ? c->object_size : 0; + return size; + } EXPORT_SYMBOL(kmalloc_size_roundup); -- cgit v1.2.3

From db58b5eea8a47da3bed6b128dfcbdaf336c6a244 Mon Sep 17 00:00:00 2001
From: Christian Brauner
Date: Wed, 20 Sep 2023 16:40:22 +0200
Subject: Revert "tmpfs: add support for multigrain timestamps"

This reverts commit d48c3397291690c3576d6c983b0a86ecbc203cac.

Users reported regressions due to enabling multi-grained timestamps
unconditionally.  As no clear consensus on a solution has come up and the
discussion has gone back to the drawing board, revert the infrastructure
changes for now.  If it isn't code that's here to stay, make it go away.
Message-ID: <20230920-keine-eile-c9755b5825db@brauner> Acked-by: Jan Kara Acked-by: Jeff Layton Signed-off-by: Christian Brauner --- mm/shmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 02e62fccc80d..69595d341882 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4586,7 +4586,7 @@ static struct file_system_type shmem_fs_type = { #endif .kill_sb = kill_litter_super, #ifdef CONFIG_SHMEM - .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, #else .fs_flags = FS_USERNS_MOUNT, #endif -- cgit v1.2.3 From a501a0703044f00180d7697b32cacd7ff46d02d8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 20 Sep 2023 04:53:35 +0100 Subject: mm: report success more often from filemap_map_folio_range() Even though we had successfully mapped the relevant page, we would rarely return success from filemap_map_folio_range(). That leads to falling back from the VMA lock path to the mmap_lock path, which is a speed & scalability issue. Found by inspection. Link: https://lkml.kernel.org/r/20230920035336.854212-1-willy@infradead.org Fixes: 617c28ecab22 ("filemap: batch PTE mappings") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Yin Fengwei Cc: Dave Hansen Cc: David Hildenbrand Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 4ea4387053e8..f0a15ce1bd1b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3503,7 +3503,7 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); folio_ref_add(folio, count); - if (in_range(vmf->address, addr, count)) + if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } @@ -3517,7 +3517,7 @@ skip: if (count) { set_pte_range(vmf, folio, page, count, addr); folio_ref_add(folio, count); - if (in_range(vmf->address, addr, count)) + if (in_range(vmf->address, addr, count * PAGE_SIZE)) ret = VM_FAULT_NOPAGE; } -- cgit v1.2.3 From 935d4f0c6dc8b3533e6e39346de7389a84490178 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 22 Sep 2023 12:58:03 +0100 Subject: mm: hugetlb: add huge page size param to set_huge_pte_at() Patch series "Fix set_huge_pte_at() panic on arm64", v2. This series fixes a bug in arm64's implementation of set_huge_pte_at(), which can result in an unprivileged user causing a kernel panic. The problem was triggered when running the new uffd poison mm selftest for HUGETLB memory. This test (and the uffd poison feature) was merged for v6.5-rc7. Ideally, I'd like to get this fix in for v6.6 and I've cc'ed stable (correctly this time) to get it backported to v6.5, where the issue first showed up. Description of Bug ================== arm64's huge pte implementation supports multiple huge page sizes, some of which are implemented in the page table with multiple contiguous entries. So set_huge_pte_at() needs to work out how big the logical pte is, so that it can also work out how many physical ptes (or pmds) need to be written. It previously did this by grabbing the folio out of the pte and querying its size. However, there are cases when the pte being set is actually a swap entry. But this also used to work fine, because for huge ptes, we only ever saw migration entries and hwpoison entries. And both of these types of swap entries have a PFN embedded, so the code would grab that and everything still worked out. 
But over time, more calls to set_huge_pte_at() have been added that set swap
entry types that do not embed a PFN.  And this causes the code to go bang.
The triggering case is for the uffd poison test, commit 99aa77215ad0
("selftests/mm: add uffd unit test for UFFDIO_POISON"), which causes a
PTE_MARKER_POISONED swap entry to be set, courtesy of commit 8a13897fb0da
("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") - added in
v6.5-rc7.  Although review shows that there are other call sites that set
PTE_MARKER_UFFD_WP (which also has no PFN), these don't trigger on arm64
because arm64 doesn't support UFFD WP.

If CONFIG_DEBUG_VM is enabled, we do at least get a BUG(), but otherwise,
it will dereference a bad pointer in page_folio():

static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry)
{
	VM_BUG_ON(!is_migration_entry(entry) && !is_hwpoison_entry(entry));

	return page_folio(pfn_to_page(swp_offset_pfn(entry)));
}

Fix
===

The simplest fix would have been to revert the dodgy cleanup commit
18f3962953e4 ("mm: hugetlb: kill set_huge_swap_pte_at()"), but since things
have moved on, this would have required an audit of all the new
set_huge_pte_at() call sites to see if they should be converted to
set_huge_swap_pte_at().  As per the original intent of the change, it would
also leave us open to future bugs when people invariably get it wrong and
call the wrong helper.

So instead, I've added a huge page size parameter to set_huge_pte_at().
This means that the arm64 code has the size in all cases.  It's a bigger
change, due to needing to touch the arches that implement the function, but
it is entirely mechanical, so in my view, low risk.

I've compile-tested all touched arches; arm64, parisc, powerpc, riscv,
s390, sparc (and additionally x86_64).  I've additionally booted and run mm
selftests against arm64, where I observe the uffd poison test is fixed, and
there are no other regressions.

This patch (of 2):

In order to fix a bug, arm64 needs to be told the size of the huge page for
which the pte is being set in set_huge_pte_at().  Provide for this by
adding an `unsigned long sz` parameter to the function.  This follows the
same pattern as huge_pte_clear().

This commit makes the required interface modifications to the core mm as
well as all arches that implement this function (arm64, parisc, powerpc,
riscv, s390, sparc).  The actual arm64 bug will be fixed in a separate
commit.

No behavioral changes intended.

Link: https://lkml.kernel.org/r/20230922115804.2043771-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20230922115804.2043771-2-ryan.roberts@arm.com
Fixes: 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs")
Signed-off-by: Ryan Roberts
Reviewed-by: Christophe Leroy [powerpc 8xx]
Reviewed-by: Lorenzo Stoakes [vmalloc change]
Cc: Alexandre Ghiti
Cc: Albert Ou
Cc: Alexander Gordeev
Cc: Anshuman Khandual
Cc: Arnd Bergmann
Cc: Axel Rasmussen
Cc: Catalin Marinas
Cc: Christian Borntraeger
Cc: Christoph Hellwig
Cc: David S. Miller
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: Helge Deller
Cc: "James E.J.
Bottomley" Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Xu Cc: Qi Zheng Cc: Ryan Roberts Cc: SeongJae Park Cc: Sven Schnelle Cc: Uladzislau Rezki (Sony) Cc: Vasily Gorbik Cc: Will Deacon Cc: [6.5+] Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 2 +- arch/arm64/mm/hugetlbpage.c | 6 ++-- arch/parisc/include/asm/hugetlb.h | 2 +- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 3 +- arch/powerpc/mm/book3s64/hugetlbpage.c | 5 ++- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 3 +- arch/powerpc/mm/nohash/8xx.c | 3 +- arch/powerpc/mm/pgtable.c | 3 +- arch/riscv/include/asm/hugetlb.h | 3 +- arch/riscv/mm/hugetlbpage.c | 3 +- arch/s390/include/asm/hugetlb.h | 6 ++-- arch/s390/mm/hugetlbpage.c | 8 ++++- arch/sparc/include/asm/hugetlb.h | 6 ++-- arch/sparc/mm/hugetlbpage.c | 8 ++++- include/asm-generic/hugetlb.h | 2 +- include/linux/hugetlb.h | 6 ++-- mm/damon/vaddr.c | 3 +- mm/hugetlb.c | 43 +++++++++++++----------- mm/migrate.c | 7 ++-- mm/rmap.c | 23 ++++++++++--- mm/vmalloc.c | 2 +- 22 files changed, 100 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index f43a38ac1779..2ddc33d93b13 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -28,7 +28,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags); #define arch_make_huge_pte arch_make_huge_pte #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); + pte_t *ptep, pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 9c52718ea750..a7f8c8db3425 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -249,7 +249,7 @@ static inline struct folio *hugetlb_swap_entry_to_folio(swp_entry_t entry) } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { size_t pgsize; int i; @@ -571,5 +571,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index f7f078c2872c..72daacc472a0 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -6,7 +6,7 @@ #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); + pte_t *ptep, pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index a8a1a7c1e16e..a9f7e21f6656 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -140,7 +140,7 @@ static void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) + pte_t 
*ptep, pte_t entry, unsigned long sz) { __set_huge_pte_at(mm, addr, ptep, entry); } diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index de092b04ee1a..92df40c6cc6b 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -46,7 +46,8 @@ static inline int check_and_get_huge_psize(int shift) } #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz); #define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/mm/book3s64/hugetlbpage.c b/arch/powerpc/mm/book3s64/hugetlbpage.c index 3bc0eb21b2a0..5a2e512e96db 100644 --- a/arch/powerpc/mm/book3s64/hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/hugetlbpage.c @@ -143,11 +143,14 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { + unsigned long psize; if (radix_enabled()) return radix__huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + + psize = huge_page_size(hstate_vma(vma)); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } void __init hugetlbpage_init_defaultsize(void) diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 17075c78d4bc..35fd2a95be24 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -47,6 +47,7 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, pte_t old_pte, pte_t pte) { struct mm_struct *mm = vma->vm_mm; + unsigned long psize = huge_page_size(hstate_vma(vma)); /* * POWER9 NMMU must flush the TLB after clearing the PTE before @@ -58,5 +59,5 @@ void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, atomic_read(&mm->context.copros) > 0) radix__flush_hugetlb_page(vma, addr); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index dbbfe897455d..a642a7929892 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -91,7 +91,8 @@ static int __ref __early_map_kernel_hugepage(unsigned long va, phys_addr_t pa, if (new && WARN_ON(pte_present(*ptep) && pgprot_val(prot))) return -EINVAL; - set_huge_pte_at(&init_mm, va, ptep, pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot))); + set_huge_pte_at(&init_mm, va, ptep, + pte_mkhuge(pfn_pte(pa >> PAGE_SHIFT, prot)), psize); return 0; } diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 3f86fd217690..3ba9fe411604 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -288,7 +288,8 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, } #if defined(CONFIG_PPC_8xx) -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned long sz) { pmd_t *pmd = pmd_off(mm, addr); pte_basic_t val; diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 34e24f078cc1..4c5b0e929890 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ 
b/arch/riscv/include/asm/hugetlb.h @@ -18,7 +18,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pte); + unsigned long addr, pte_t *ptep, pte_t pte, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 96225a8533ad..e4a2ace92dbe 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -180,7 +180,8 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) + pte_t pte, + unsigned long sz) { int i, pte_num; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index f07267875a19..deb198a61039 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -16,6 +16,8 @@ #define hugepages_supported() (MACHINE_HAS_EDAT1) void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -65,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(ptep), pte); if (changed) { huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; } @@ -74,7 +76,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index c718f2a0de94..297a6d897d5a 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -142,7 +142,7 @@ static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste) __storage_key_init_range(paddr, paddr + size - 1); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { unsigned long rste; @@ -163,6 +163,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(rste)); } +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, pte); +} + pte_t huge_ptep_get(pte_t *ptep) { return __rste_to_pte(pte_val(*ptep)); diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 0a26cca24232..c714ca6a05aa 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -14,6 +14,8 @@ extern struct pud_huge_patch_entry __pud_huge_patch, __pud_huge_patch_end; #define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz); +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR @@ -32,7 +34,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, 
unsigned long addr, pte_t *ptep) { pte_t old_pte = *ptep; - set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); + __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(old_pte)); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -42,7 +44,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, { int changed = !pte_same(*ptep, pte); if (changed) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); flush_tlb_page(vma, addr); } return changed; diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d7018823206c..b432500c13a5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -328,7 +328,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return pte_offset_huge(pmd, addr); } -void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, +void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { unsigned int nptes, orig_shift, shift; @@ -364,6 +364,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, orig_shift); } +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned long sz) +{ + __set_huge_pte_at(mm, addr, ptep, entry); +} + pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 4da02798a00b..6dcf4d576970 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -76,7 +76,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, #ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { set_pte_at(mm, addr, ptep, pte); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5b2626063f4f..a30686e649f7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -984,7 +984,9 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { - set_huge_pte_at(vma->vm_mm, addr, ptep, pte); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + set_huge_pte_at(vma->vm_mm, addr, ptep, pte, psize); } #endif @@ -1173,7 +1175,7 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, } static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) + pte_t *ptep, pte_t pte, unsigned long sz) { } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 4c81a9dbd044..cf8a9fc5c9d1 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -341,13 +341,14 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, bool referenced = false; pte_t entry = huge_ptep_get(pte); struct folio *folio = pfn_folio(pte_pfn(entry)); + unsigned long psize = huge_page_size(hstate_vma(vma)); folio_get(folio); if (pte_young(entry)) { referenced = true; entry = pte_mkold(entry); - set_huge_pte_at(mm, addr, pte, entry); + set_huge_pte_at(mm, addr, pte, entry, psize); } #ifdef CONFIG_MMU_NOTIFIER diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ba6d39b71cb1..52d26072dfda 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4980,7 +4980,7 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte) static void hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr, - struct folio *new_folio, pte_t old) + struct folio *new_folio, pte_t old, unsigned long sz) { pte_t 
newpte = make_huge_pte(vma, &new_folio->page, 1); @@ -4988,7 +4988,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add hugepage_add_new_anon_rmap(new_folio, vma, addr); if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(vma->vm_mm, addr, ptep, newpte); + set_huge_pte_at(vma->vm_mm, addr, ptep, newpte, sz); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); folio_set_hugetlb_migratable(new_folio); } @@ -5065,7 +5065,7 @@ again: } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) { if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_hugetlb_entry_migration(entry))) { swp_entry_t swp_entry = pte_to_swp_entry(entry); bool uffd_wp = pte_swp_uffd_wp(entry); @@ -5080,18 +5080,18 @@ again: entry = swp_entry_to_pte(swp_entry); if (userfaultfd_wp(src_vma) && uffd_wp) entry = pte_swp_mkuffd_wp(entry); - set_huge_pte_at(src, addr, src_pte, entry); + set_huge_pte_at(src, addr, src_pte, entry, sz); } if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_pte_at(dst, addr, dst_pte, entry, sz); } else if (unlikely(is_pte_marker(entry))) { pte_marker marker = copy_pte_marker( pte_to_swp_entry(entry), dst_vma); if (marker) set_huge_pte_at(dst, addr, dst_pte, - make_pte_marker(marker)); + make_pte_marker(marker), sz); } else { entry = huge_ptep_get(src_pte); pte_folio = page_folio(pte_page(entry)); @@ -5145,7 +5145,7 @@ again: goto again; } hugetlb_install_folio(dst_vma, dst_pte, addr, - new_folio, src_pte_old); + new_folio, src_pte_old, sz); spin_unlock(src_ptl); spin_unlock(dst_ptl); continue; @@ -5166,7 +5166,7 @@ again: if (!userfaultfd_wp(dst_vma)) entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_pte_at(dst, addr, dst_pte, entry, sz); hugetlb_count_add(npages, dst); } spin_unlock(src_ptl); @@ -5184,7 +5184,8 @@ again: } static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, - unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte) + unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte, + unsigned long sz) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; @@ -5202,7 +5203,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); - set_huge_pte_at(mm, new_addr, dst_pte, pte); + set_huge_pte_at(mm, new_addr, dst_pte, pte, sz); if (src_ptl != dst_ptl) spin_unlock(src_ptl); @@ -5259,7 +5260,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, if (!dst_pte) break; - move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte); + move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz); } if (shared_pmd) @@ -5337,7 +5338,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (pte_swp_uffd_wp_any(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP)); + make_pte_marker(PTE_MARKER_UFFD_WP), + sz); else huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); @@ -5371,7 +5373,8 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, - 
make_pte_marker(PTE_MARKER_UFFD_WP)); + make_pte_marker(PTE_MARKER_UFFD_WP), + sz); hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); @@ -5676,7 +5679,7 @@ retry_avoidcopy: hugepage_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) newpte = huge_pte_mkuffd_wp(newpte); - set_huge_pte_at(mm, haddr, ptep, newpte); + set_huge_pte_at(mm, haddr, ptep, newpte, huge_page_size(h)); folio_set_hugetlb_migratable(new_folio); /* Make the old page be freed below */ new_folio = old_folio; @@ -5972,7 +5975,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, */ if (unlikely(pte_marker_uffd_wp(old_pte))) new_pte = huge_pte_mkuffd_wp(new_pte); - set_huge_pte_at(mm, haddr, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { @@ -6261,7 +6264,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, } _dst_pte = make_pte_marker(PTE_MARKER_POISONED); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, + huge_page_size(h)); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); @@ -6412,7 +6416,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (wp_enabled) _dst_pte = huge_pte_mkuffd_wp(_dst_pte); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, huge_page_size(h)); hugetlb_count_add(pages_per_huge_page(h), dst_mm); @@ -6598,7 +6602,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, else if (uffd_wp_resolve) newpte = pte_swp_clear_uffd_wp(newpte); if (!pte_same(pte, newpte)) - set_huge_pte_at(mm, address, ptep, newpte); + set_huge_pte_at(mm, address, ptep, newpte, psize); } else if (unlikely(is_pte_marker(pte))) { /* No other markers apply for now. */ WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); @@ -6623,7 +6627,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma, if (unlikely(uffd_wp)) /* Safe to modify directly (none->non-present). */ set_huge_pte_at(mm, address, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP)); + make_pte_marker(PTE_MARKER_UFFD_WP), + psize); } spin_unlock(ptl); } diff --git a/mm/migrate.c b/mm/migrate.c index b7fa020003f3..2053b54556ca 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -243,7 +243,9 @@ static bool remove_migration_pte(struct folio *folio, #ifdef CONFIG_HUGETLB_PAGE if (folio_test_hugetlb(folio)) { - unsigned int shift = huge_page_shift(hstate_vma(vma)); + struct hstate *h = hstate_vma(vma); + unsigned int shift = huge_page_shift(h); + unsigned long psize = huge_page_size(h); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) @@ -251,7 +253,8 @@ static bool remove_migration_pte(struct folio *folio, rmap_flags); else page_dup_file_rmap(new, true); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte, + psize); } else #endif { diff --git a/mm/rmap.c b/mm/rmap.c index ec7f8e6c9e48..9f795b93cf40 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1480,6 +1480,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; + unsigned long hsz = 0; /* * When racing against e.g. 
zap_pte_range() on another cpu, @@ -1511,6 +1512,9 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + + /* We need the huge page size for set_huge_pte_at() */ + hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); @@ -1628,7 +1632,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_pte_at(mm, address, pvmw.pte, pteval); + set_huge_pte_at(mm, address, pvmw.pte, pteval, + hsz); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -1820,6 +1825,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; unsigned long pfn; + unsigned long hsz = 0; /* * When racing against e.g. zap_pte_range() on another cpu, @@ -1855,6 +1861,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + + /* We need the huge page size for set_huge_pte_at() */ + hsz = huge_page_size(hstate_vma(vma)); } mmu_notifier_invalidate_range_start(&range); @@ -2020,7 +2029,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, pteval = swp_entry_to_pte(make_hwpoison_entry(subpage)); if (folio_test_hugetlb(folio)) { hugetlb_count_sub(folio_nr_pages(folio), mm); - set_huge_pte_at(mm, address, pvmw.pte, pteval); + set_huge_pte_at(mm, address, pvmw.pte, pteval, + hsz); } else { dec_mm_counter(mm, mm_counter(&folio->page)); set_pte_at(mm, address, pvmw.pte, pteval); @@ -2044,7 +2054,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (arch_unmap_one(mm, vma, address, pteval) < 0) { if (folio_test_hugetlb(folio)) - set_huge_pte_at(mm, address, pvmw.pte, pteval); + set_huge_pte_at(mm, address, pvmw.pte, + pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; @@ -2058,7 +2069,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (anon_exclusive && page_try_share_anon_rmap(subpage)) { if (folio_test_hugetlb(folio)) - set_huge_pte_at(mm, address, pvmw.pte, pteval); + set_huge_pte_at(mm, address, pvmw.pte, + pteval, hsz); else set_pte_at(mm, address, pvmw.pte, pteval); ret = false; @@ -2090,7 +2102,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); if (folio_test_hugetlb(folio)) - set_huge_pte_at(mm, address, pvmw.pte, swp_pte); + set_huge_pte_at(mm, address, pvmw.pte, swp_pte, + hsz); else set_pte_at(mm, address, pvmw.pte, swp_pte); trace_set_migration_pte(address, pte_val(swp_pte), diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ef8599d394fd..a3fedb3ee0db 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -111,7 +111,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t entry = pfn_pte(pfn, prot); entry = arch_make_huge_pte(entry, ilog2(size), 0); - set_huge_pte_at(&init_mm, addr, pte, entry); + set_huge_pte_at(&init_mm, addr, pte, entry, size); pfn += PFN_DOWN(size); continue; } -- cgit v1.2.3 From ca56489c2f1df516801eeff06129d2e9e282ab7b Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Fri, 22 Sep 2023 19:22:11 +0200 
Subject: mm: zswap: fix potential memory corruption on duplicate store While stress-testing zswap, memory corruption was observed when writing back pages. __frontswap_store used to check for duplicate entries before attempting to store a page in zswap; this was because, if the store fails, the old entry isn't removed from the tree. This change removes duplicate entries in zswap_store() before the actual store attempt. [cerasuolodomenico@gmail.com: add a warning and a comment, per Johannes] Link: https://lkml.kernel.org/r/20230925130002.1929369-1-cerasuolodomenico@gmail.com Link: https://lkml.kernel.org/r/20230922172211.1704917-1-cerasuolodomenico@gmail.com Fixes: 42c06a0e8ebe ("mm: kill frontswap") Signed-off-by: Domenico Cerasuolo Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 412b1409a0d7..083c693602b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1218,6 +1218,19 @@ bool zswap_store(struct folio *folio) if (!zswap_enabled || !tree) return false; + /* + * If this is a duplicate, it must be removed before attempting to store + * it, otherwise, if the store fails the old page won't be removed from + * the tree, and it might be written back overriding the new data. + */ + spin_lock(&tree->lock); + dupentry = zswap_rb_search(&tree->rbroot, offset); + if (dupentry) { + zswap_duplicate_entry++; + zswap_invalidate_entry(tree, dupentry); + } + spin_unlock(&tree->lock); + /* * XXX: zswap reclaim does not work with cgroups yet. Without a * cgroup-aware entry LRU, we will push out entries system-wide based on @@ -1333,7 +1346,14 @@ insert_entry: /* map */ spin_lock(&tree->lock); + /* + * A duplicate entry should have been removed at the beginning of this + * function. Since the swap entry should be pinned, if a duplicate is + * found again here it means that something went wrong in the swap + * cache. + */ while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { + WARN_ON(1); zswap_duplicate_entry++; zswap_invalidate_entry(tree, dupentry); } -- cgit v1.2.3 From 4597648fddeadef5877610d693af11906aa666ac Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 21 Sep 2023 09:38:29 +0200 Subject: mm, memcg: reconsider kmem.limit_in_bytes deprecation This reverts commits 86327e8eb94c ("memcg: drop kmem.limit_in_bytes") and partially reverts 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes") which have incrementally removed support for the kernel memory accounting hard limit. Unfortunately it has turned out that there is still userspace depending on the existence of memory.kmem.limit_in_bytes [1]. The underlying functionality is not really required, but the non-existent file just confuses the userspace, which then fails as a result. The patch to fix this on the userspace side has been submitted, but it is hard to predict how it will propagate through the maze of 3rd party consumers of the software. Now, reverting 86327e8eb94c alone is not an option because there is another set of userspace which cannot cope with ENOTSUPP returned when writing to the file. Therefore we have to go and revisit 58056f77502f as well. There are two ways to go ahead: either we give up on the deprecation and fully revert 58056f77502f as well, or we keep kmem.limit_in_bytes but make the write a noop and warn about the fact.
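To make the userspace dependency concrete, here is a minimal, hypothetical sketch of the two breaking patterns described above; the cgroup v1 mount point and the group name are illustrative assumptions, not taken from the report:

/* Hypothetical userspace probe; the cgroup path is assumed for illustration. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *knob =
		"/sys/fs/cgroup/memory/mygroup/memory.kmem.limit_in_bytes";
	int fd;

	/* Pattern 1: bail out if the file does not exist at all. */
	if (access(knob, F_OK) != 0) {
		fprintf(stderr, "no kmem limit knob: %s\n", strerror(errno));
		return 1;
	}

	/* Pattern 2: bail out if writing a limit fails (e.g. with ENOTSUPP). */
	fd = open(knob, O_WRONLY);
	if (fd < 0 || write(fd, "1073741824", 10) < 0) {
		fprintf(stderr, "cannot set kmem limit: %s\n", strerror(errno));
		return 1;
	}
	close(fd);
	return 0;
}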
This should work for both known breaking workloads, which depend on the existence of the file but do not depend on the hard limit being enforced. Note to backporters to stable trees: a8c49af3be5f ("memcg: add per-memcg total kernel memory stat"), introduced in 5.18, added memcg_account_kmem, so the accounting is not done by obj_cgroup_charge_pages directly for v1 anymore. Prior kernels need to add it explicitly (thanks to Johannes for pointing this out). [akpm@linux-foundation.org: fix build - remove unused local] Link: http://lkml.kernel.org/r/20230920081101.GA12096@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net [1] Link: https://lkml.kernel.org/r/ZRE5VJozPZt9bRPy@dhcp22.suse.cz Fixes: 86327e8eb94c ("memcg: drop kmem.limit_in_bytes") Fixes: 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes") Signed-off-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Greg Kroah-Hartman Cc: Jeremi Piotrowski Cc: Muchun Song Cc: Roman Gushchin Cc: Tejun heo Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 7 +++++++ mm/memcontrol.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 5f502bf68fbc..ff456871bf4b 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -92,6 +92,13 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node + memory.kmem.limit_in_bytes Deprecated knob to set and read the kernel + memory hard limit. Kernel hard limit is not + supported since 5.16. Writing any value to + do file will not have any effect same as if + nokmem kernel parameter was specified. + Kernel memory is still charged and reported + by memory.kmem.usage_in_bytes. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d13dde2f8b56..5b009b233ab8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3867,6 +3867,13 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _MEMSWAP: ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; + case _KMEM: + pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. " + "Writing any value to this file has no effect. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); + ret = 0; + break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); break; @@ -5077,6 +5084,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif + { + .name = "kmem.limit_in_bytes", + .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), + .write = mem_cgroup_write, + .read_u64 = mem_cgroup_read_u64, + }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), -- cgit v1.2.3 From 45120b15743fa7c0aa53d5db6dfb4c8f87be4abd Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 25 Sep 2023 15:20:59 +0800 Subject: mm/damon/vaddr-test: fix memory leak in damon_do_test_apply_three_regions() With CONFIG_DAMON_VADDR_KUNIT_TEST=y, CONFIG_DEBUG_KMEMLEAK=y and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y, the memory leak below is detected.
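Before the full kmemleak report, a hedged sketch of the leaking test pattern, heavily simplified from damon_do_test_apply_three_regions(); the function name and the region bounds here are illustrative, not the verbatim test code:

/*
 * Simplified sketch only; assumes the mm/damon/vaddr-test.h context,
 * i.e. <linux/damon.h> and <kunit/test.h> are already available.
 */
static void damon_leak_sketch(struct kunit *test)
{
	struct damon_target *t = damon_new_target();	    /* kmalloc()-ed     */
	struct damon_region *r = damon_new_region(10, 20);  /* kmem_cache alloc */

	damon_add_region(r, t);
	/* ... damon_set_regions() and further KUNIT_EXPECT_*() checks ... */
	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u);

	/* Nothing frees 't' or its regions here, hence the reports below. */
}

The one-line fix further below adds a damon_destroy_target(t) at the end of the helper, which frees the target together with all of its regions.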
Since commit 9f86d624292c ("mm/damon/vaddr-test: remove unnecessary variables"), damon_destroy_ctx() is no longer called, but the test still calls damon_new_target() and damon_new_region(), so the damon_region allocated by kmem_cache_alloc() in damon_new_region() and the damon_target allocated by kmalloc() in damon_new_target() are never freed. The damon_region allocated by damon_new_region() from within damon_set_regions() is also not freed. So use damon_destroy_target() to free all the damon_regions and the damon_target. unreferenced object 0xffff888107c9a940 (size 64): comm "kunit_try_catch", pid 1069, jiffies 4294670592 (age 732.761s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 06 00 00 00 6b 6b 6b 6b ............kkkk 60 c7 9c 07 81 88 ff ff f8 cb 9c 07 81 88 ff ff `............... backtrace: [] kmalloc_trace+0x27/0xa0 [] damon_new_target+0x3f/0x1b0 [] damon_do_test_apply_three_regions.constprop.0+0x95/0x3e0 [] damon_test_apply_three_regions1+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff8881079cc740 (size 56): comm "kunit_try_catch", pid 1069, jiffies 4294670592 (age 732.761s) hex dump (first 32 bytes): 05 00 00 00 00 00 00 00 14 00 00 00 00 00 00 00 ................ 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b kkkkkkkk....kkkk backtrace: [] damon_new_region+0x22/0x1c0 [] damon_do_test_apply_three_regions.constprop.0+0xd1/0x3e0 [] damon_test_apply_three_regions1+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff888107c9ac40 (size 64): comm "kunit_try_catch", pid 1071, jiffies 4294670595 (age 732.843s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 06 00 00 00 6b 6b 6b 6b ............kkkk a0 cc 9c 07 81 88 ff ff 78 a1 76 07 81 88 ff ff ........x.v..... backtrace: [] kmalloc_trace+0x27/0xa0 [] damon_new_target+0x3f/0x1b0 [] damon_do_test_apply_three_regions.constprop.0+0x95/0x3e0 [] damon_test_apply_three_regions2+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff8881079ccc80 (size 56): comm "kunit_try_catch", pid 1071, jiffies 4294670595 (age 732.843s) hex dump (first 32 bytes): 05 00 00 00 00 00 00 00 14 00 00 00 00 00 00 00 ................ 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b kkkkkkkk....kkkk backtrace: [] damon_new_region+0x22/0x1c0 [] damon_do_test_apply_three_regions.constprop.0+0xd1/0x3e0 [] damon_test_apply_three_regions2+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff888107c9af40 (size 64): comm "kunit_try_catch", pid 1073, jiffies 4294670597 (age 733.011s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 06 00 00 00 6b 6b 6b 6b ............kkkk 20 a2 76 07 81 88 ff ff b8 a6 76 07 81 88 ff ff .v.......v.....
backtrace: [] kmalloc_trace+0x27/0xa0 [] damon_new_target+0x3f/0x1b0 [] damon_do_test_apply_three_regions.constprop.0+0x95/0x3e0 [] damon_test_apply_three_regions3+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff88810776a200 (size 56): comm "kunit_try_catch", pid 1073, jiffies 4294670597 (age 733.011s) hex dump (first 32 bytes): 05 00 00 00 00 00 00 00 14 00 00 00 00 00 00 00 ................ 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b kkkkkkkk....kkkk backtrace: [] damon_new_region+0x22/0x1c0 [] damon_do_test_apply_three_regions.constprop.0+0xd1/0x3e0 [] damon_test_apply_three_regions3+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff88810776a740 (size 56): comm "kunit_try_catch", pid 1073, jiffies 4294670597 (age 733.025s) hex dump (first 32 bytes): 3d 00 00 00 00 00 00 00 3f 00 00 00 00 00 00 00 =.......?....... 6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b kkkkkkkk....kkkk backtrace: [] damon_new_region+0x22/0x1c0 [] damon_set_regions+0x4c2/0x8e0 [] damon_do_test_apply_three_regions.constprop.0+0xfb/0x3e0 [] damon_test_apply_three_regions3+0x21e/0x260 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff888108038240 (size 64): comm "kunit_try_catch", pid 1075, jiffies 4294670600 (age 733.022s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 03 00 00 00 6b 6b 6b 6b ............kkkk 48 ad 76 07 81 88 ff ff 98 ae 76 07 81 88 ff ff H.v.......v..... backtrace: [] kmalloc_trace+0x27/0xa0 [] damon_new_target+0x3f/0x1b0 [] damon_do_test_apply_three_regions.constprop.0+0x95/0x3e0 [] damon_test_apply_three_regions4+0x1cd/0x210 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 unreferenced object 0xffff88810776ad28 (size 56): comm "kunit_try_catch", pid 1075, jiffies 4294670600 (age 733.022s) hex dump (first 32 bytes): 05 00 00 00 00 00 00 00 07 00 00 00 00 00 00 00 ................ 
6b 6b 6b 6b 6b 6b 6b 6b 00 00 00 00 6b 6b 6b 6b kkkkkkkk....kkkk backtrace: [] damon_new_region+0x22/0x1c0 [] damon_set_regions+0x4c2/0x8e0 [] damon_do_test_apply_three_regions.constprop.0+0xfb/0x3e0 [] damon_test_apply_three_regions4+0x1cd/0x210 [] kunit_generic_run_threadfn_adapter+0x4a/0x90 [] kthread+0x2b6/0x380 [] ret_from_fork+0x2d/0x70 [] ret_from_fork_asm+0x11/0x20 Link: https://lkml.kernel.org/r/20230925072100.3725620-1-ruanjinjie@huawei.com Fixes: 9f86d624292c ("mm/damon/vaddr-test: remove unnecessary variables") Signed-off-by: Jinjie Ruan Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/vaddr-test.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h index c4b455b5ee30..dcf1ca6b31cc 100644 --- a/mm/damon/vaddr-test.h +++ b/mm/damon/vaddr-test.h @@ -148,6 +148,8 @@ static void damon_do_test_apply_three_regions(struct kunit *test, KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]); KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]); } + + damon_destroy_target(t); } /* -- cgit v1.2.3 From 24526268f4e38c9ec0c4a30de4f37ad2a2a84e47 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 20 Sep 2023 15:32:42 -0700 Subject: mm: mempolicy: keep VMA walk if both MPOL_MF_STRICT and MPOL_MF_MOVE are specified When calling mbind() with MPOL_MF_{MOVE|MOVEALL} | MPOL_MF_STRICT, the kernel should attempt to migrate all existing pages, and return -EIO if there is a misplaced or unmovable page. Then commit 6f4576e3687b ("mempolicy: apply page table walker on queue_pages_range()") messed up the return value and didn't break the VMA scan early anymore when MPOL_MF_STRICT alone was specified. The return value problem was fixed by commit a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified"), but that change broke out of the VMA walk early when an unmovable page was met, which may cause some pages not to be migrated as expected. The code should conceptually do: if (MPOL_MF_MOVE|MOVEALL) scan all vmas try to migrate the existing pages return success else if (MPOL_MF_MOVE* | MPOL_MF_STRICT) scan all vmas try to migrate the existing pages return -EIO if unmovable or migration failed else /* MPOL_MF_STRICT alone */ break early if meets unmovable and don't call mbind_range() at all else /* none of those flags */ check the ranges in test_walk, EFAULT without mbind_range() if discontig. Fixed the behavior. Link: https://lkml.kernel.org/r/20230920223242.3425775-1-yang@os.amperecomputing.com Fixes: a7f40cfe3b7a ("mm: mempolicy: make mbind() return -EIO when MPOL_MF_STRICT is specified") Signed-off-by: Yang Shi Cc: Hugh Dickins Cc: Suren Baghdasaryan Cc: Matthew Wilcox Cc: Michal Hocko Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Rafael Aquini Cc: Kirill A. Shutemov Cc: David Rientjes Cc: [4.9+] Signed-off-by: Andrew Morton --- mm/mempolicy.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 42b5567e3773..f1b00d6ac7ee 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -426,6 +426,7 @@ struct queue_pages { unsigned long start; unsigned long end; struct vm_area_struct *first; + bool has_unmovable; }; /* @@ -446,9 +447,8 @@ static inline bool queue_folio_required(struct folio *folio, /* * queue_folios_pmd() has three possible return values: * 0 - folios are placed on the right node or queued successfully, or - special page is met, i.e. huge zero page.
- * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. + * special page is met, i.e. zero page, or unmovable page is found + * but continue walking (indicated by queue_pages.has_unmovable). * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an * existing folio was already on a node that does not follow the * policy. @@ -479,7 +479,7 @@ static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || migrate_folio_add(folio, qp->pagelist, flags)) { - ret = 1; + qp->has_unmovable = true; goto unlock; } } else @@ -495,9 +495,8 @@ unlock: * * queue_folios_pte_range() has three possible return values: * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page. - * 1 - there is unmovable folio, and MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. + * special page is met, i.e. zero page, or unmovable page is found + * but continue walking (indicated by queue_pages.has_unmovable). * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already * on a node that does not follow the policy. */ @@ -508,7 +507,6 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, struct folio *folio; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; - bool has_unmovable = false; pte_t *pte, *mapped_pte; pte_t ptent; spinlock_t *ptl; @@ -538,11 +536,12 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, if (!queue_folio_required(folio, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* MPOL_MF_STRICT must be specified if we get here */ - if (!vma_migratable(vma)) { - has_unmovable = true; - break; - } + /* + * MPOL_MF_STRICT must be specified if we get here. + * Continue walking vmas due to MPOL_MF_MOVE* flags. + */ + if (!vma_migratable(vma)) + qp->has_unmovable = true; /* * Do not abort immediately since there may be @@ -550,16 +549,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, * need migrate other LRU pages. */ if (migrate_folio_add(folio, qp->pagelist, flags)) - has_unmovable = true; + qp->has_unmovable = true; } else break; } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); - if (has_unmovable) - return 1; - return addr != end ? -EIO : 0; } @@ -599,7 +595,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Detecting misplaced folio but allow migrating folios which * have been queued. */ - ret = 1; + qp->has_unmovable = true; goto unlock; } @@ -620,7 +616,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, * Failed to isolate folio but allow migrating pages * which have been queued. */ - ret = 1; + qp->has_unmovable = true; } unlock: spin_unlock(ptl); @@ -756,12 +752,15 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .start = start, .end = end, .first = NULL, + .has_unmovable = false, }; const struct mm_walk_ops *ops = lock_vma ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); + if (qp.has_unmovable) + err = 1; if (!qp.first) /* whole range in hole */ err = -EFAULT; @@ -1358,7 +1357,7 @@ static long do_mbind(unsigned long start, unsigned long len, putback_movable_pages(&pagelist); } - if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) + if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT)) err = -EIO; } else { up_out: -- cgit v1.2.3
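To make the restored semantics concrete, here is a hedged userspace sketch of the mbind() call described above; it assumes libnuma's <numaif.h> wrapper (link with -lnuma) and that NUMA node 0 exists, and is illustrative rather than part of the patch:

/* Hypothetical demo: mbind() with MPOL_MF_MOVE | MPOL_MF_STRICT. */
#include <errno.h>
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;			/* 4 MiB, illustrative */
	unsigned long nodemask = 1UL << 0;	/* bind to node 0 */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);	/* fault pages in so there is something to migrate */

	/*
	 * With MPOL_MF_MOVE | MPOL_MF_STRICT the kernel is expected to walk
	 * the whole range and try to migrate every existing page, and only
	 * then report EIO if something was unmovable or failed to migrate.
	 */
	if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		fprintf(stderr, "mbind: %s\n", strerror(errno));

	munmap(p, len);
	return 0;
}

With the fix applied, such a call fails with EIO only after the whole range has been scanned and migration has been attempted, instead of bailing out of the VMA walk early.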