From 20c897eadf13ee4305af62dbdfe7819307ddf0bc Mon Sep 17 00:00:00 2001
From: Charan Teja Kalla
Date: Mon, 3 Jul 2023 19:36:41 +0530
Subject: mm: madvise: fix uneven accounting of psi

A folio turns into a Workingset folio in two places: 1) shrink_active_list(), when moving the folio from the active to the inactive list; 2) during a workingset transition on folio refault. Once Workingset is set on a folio, PSI for memory can be accounted a) when that folio is reclaimed and b) on refault of that folio, for usual reclaims.

This PSI accounting for the reclaim + refault sequence is not consistent between usual reclaim and madvise(COLD/PAGEOUT), which deactivates or proactively reclaims a folio:

a) A folio starts on the inactive list and moves to the active list as it is accessed. Workingset is absent on the folio, so its refault after being reclaimed through MADV_PAGEOUT doesn't account for PSI.

b) The same folio transitions from inactive->active and then back to inactive through shrink_active_list(). Workingset is set on the folio, so its refault after being reclaimed through MADV_PAGEOUT accounts for PSI.

c) The same folio lands on the active list directly as a result of a refault, having been a workingset folio prior to eviction. Workingset is set on the folio, so its refault after being reclaimed through MADV_PAGEOUT/MADV_COLD accounts for PSI.

d) MADV_COLD moves the folio from the active list to the inactive list. Such folios may not have Workingset set, so a refault of such a folio doesn't account for PSI.

As described above, a refault caused by MADV_PAGEOUT of a folio accounts for memory PSI in b) and c) but not in a). A refault caused by reclaim of a folio on which MADV_COLD was performed accounts for memory PSI in c) but not in d). These behaviours are inconsistent w.r.t. the usual reclaim + refault operation. Make the PSI accounting always consistent by turning a folio into a workingset one whenever it leaves the active list. Accounting PSI on a folio whenever it leaves the active list as part of a MADV_COLD/PAGEOUT operation also helps users tell whether they are operating on the proper folios[1].
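For context, a minimal userspace sketch (illustrative only, not part of the patch; MADV_COLD/MADV_PAGEOUT need Linux 5.4+ headers) of the madvise() calls whose reclaim + refault accounting this change makes consistent with regular reclaim:

  #include <string.h>
  #include <sys/mman.h>

  int main(void)
  {
          size_t len = 64UL << 20;        /* 64 MiB; the size is arbitrary */
          char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (buf == MAP_FAILED)
                  return 1;
          memset(buf, 1, len);            /* fault the range in */

          madvise(buf, len, MADV_COLD);   /* deactivate: active -> inactive */
          madvise(buf, len, MADV_PAGEOUT);/* proactively reclaim the range */

          memset(buf, 2, len);            /* refault; with this patch the PSI
                                             accounting matches regular reclaim */
          return 0;
  }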
[1] https://lore.kernel.org/all/20230605180013.GD221380@cmpxchg.org/ Link: https://lkml.kernel.org/r/1688393201-11135-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Suggested-by: Suren Baghdasaryan Reported-by: Sai Manobhiram Manapragada Reported-by: Pavan Kondeti Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Pavankumar Kondeti Signed-off-by: Andrew Morton --- mm/madvise.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 886f06066622..05f97038eac3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -413,6 +413,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); + if (folio_test_active(folio)) + folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) @@ -510,6 +512,8 @@ regular_folio: */ folio_clear_referenced(folio); folio_test_clear_young(folio); + if (folio_test_active(folio)) + folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) -- cgit v1.2.3 From 3a29280afb25263c76212a8c140c29f280049ffb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 25 Jun 2023 11:33:40 +0800 Subject: mm/mm_init.c: update obsolete comment in get_pfn_range_for_nid() Since commit 633c0666b5a5 ("Memoryless nodes: drop one memoryless node boot warning"), the warning for a node with no available memory is removed. Update the corresponding comment. Link: https://lkml.kernel.org/r/20230625033340.1054103-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index a1963c3322af..d356ba59ef2a 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1681,8 +1681,7 @@ static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } * * It returns the start and end page frame of a node based on information * provided by memblock_set_node(). If called for a node - * with no available memory, a warning is printed and the start and end - * PFNs will be 0. + * with no available memory, the start and end PFNs will be 0. */ void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) -- cgit v1.2.3 From 87b11f862254396a93636f0998377ac3f6648f5f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 27 Jun 2023 10:43:49 -0700 Subject: mm: increase usage of folio_next_index() helper Simplify code pattern of 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index(). 
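For reference, a sketch of the helper being adopted here (paraphrased from include/linux/pagemap.h; the exact in-tree definition may differ in minor details) and of the conversion pattern:

  /* Returns the index one past the last page of the folio. */
  static inline pgoff_t folio_next_index(struct folio *folio)
  {
          return folio->index + folio_nr_pages(folio);
  }

  /*
   * So an open-coded "last index of the folio", as in the hunks below,
   * becomes:
   *
   *     folio->index + folio_nr_pages(folio) - 1  ->  folio_next_index(folio) - 1
   */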
Link: https://lkml.kernel.org/r/20230627174349.491803-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 4 ++-- mm/filemap.c | 8 ++++---- mm/memory.c | 2 +- mm/shmem.c | 2 +- mm/truncate.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..3d253e250871 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we diff --git a/mm/filemap.c b/mm/filemap.c index 9e44a49bbd74..c5e2c70ea046 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2075,7 +2075,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, if (!xa_is_value(folio)) { if (folio->index < *start) goto put; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2174,7 +2174,7 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) return false; if (index >= max) return false; - return index < folio->index + folio_nr_pages(folio) - 1; + return index < folio_next_index(folio) - 1; } /** @@ -2242,7 +2242,7 @@ update_start: if (folio_test_hugetlb(folio)) *start = folio->index + 1; else - *start = folio->index + folio_nr_pages(folio); + *start = folio_next_index(folio); } out: rcu_read_unlock(); @@ -2359,7 +2359,7 @@ static void filemap_get_read_batch(struct address_space *mapping, break; if (folio_test_readahead(folio)) break; - xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); diff --git a/mm/memory.c b/mm/memory.c index 603b2f419948..33f0f28c7ebc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3495,7 +3495,7 @@ void unmap_mapping_folio(struct folio *folio) VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; - last_index = folio->index + folio_nr_pages(folio) - 1; + last_index = folio_next_index(folio) - 1; details.even_cows = false; details.single_folio = folio; diff --git a/mm/shmem.c b/mm/shmem.c index f5af4b943e42..8dfd72bdc86a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -970,7 +970,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = lend < folio_pos(folio) + folio_size(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { - start = folio->index + folio_nr_pages(folio); + start = folio_next_index(folio); if (same_folio) end = folio->index; } diff --git a/mm/truncate.c b/mm/truncate.c index 95d1291d269b..2f28cc0e12ef 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -378,7 +378,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!IS_ERR(folio)) { 
same_folio = lend < folio_pos(folio) + folio_size(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { - start = folio->index + folio_nr_pages(folio); + start = folio_next_index(folio); if (same_folio) end = folio->index; } -- cgit v1.2.3
From 67490031e83a008c5ce8f562e7fa3b6b83adc861 Mon Sep 17 00:00:00 2001
From: Ma Wupeng
Date: Tue, 27 Jun 2023 20:08:32 +0800
Subject: swap: cleanup duplicated WARN_ON in add_to_avail_list

Patch series "fix WARN_ON in add_to_avail_list". The plist_node emptiness check is performed in both add_to_avail_list() and plist_add(). Drop the duplicate one in add_to_avail_list().

Link: https://lkml.kernel.org/r/20230627120833.2230766-1-mawupeng1@huawei.com Link: https://lkml.kernel.org/r/20230627120833.2230766-2-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 8e6dde68b389..2a4693642071 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -714,10 +714,8 @@ static void add_to_avail_list(struct swap_info_struct *p) int nid; spin_lock(&swap_avail_lock); - for_each_node(nid) { - WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + for_each_node(nid) plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); - } spin_unlock(&swap_avail_lock); } -- cgit v1.2.3
From c70699e555537b611f4cb426c26f8ab4a264a8a0 Mon Sep 17 00:00:00 2001
From: Ma Wupeng
Date: Tue, 27 Jun 2023 20:08:33 +0800
Subject: swap: stop add to avail list if swap is full

Our tests hit a WARN_ON in add_to_avail_list(): by the time add_to_avail_list() is called, avail_lists is already in swap_avail_heads, which triggers the WARN_ON. Here is the simplified calltrace:

------------[ cut here ]------------
Call trace:
 add_to_avail_list+0xb8/0xc0
 swap_range_free+0x110/0x138
 swapcache_free_entries+0x100/0x1c0
 free_swap_slot+0xbc/0xe0
 put_swap_folio+0x1f0/0x2ec
 delete_from_swap_cache+0x6c/0xd0
 folio_free_swap+0xa4/0xe4
 __try_to_reclaim_swap+0x9c/0x190
 free_swap_and_cache+0x84/0x88
 unmap_page_range+0x31c/0x934
 unmap_single_vma.isra.0+0x48/0x84
 unmap_vmas+0x98/0x10c
 exit_mmap+0xa4/0x210
 mmput+0x88/0x158
 do_exit+0x284/0x970
 do_group_exit+0x34/0x90
 post_copy_siginfo_from_user32+0x0/0x1cc
 do_notify_resume+0x15c/0x470
 el0_svc+0x74/0x84
 el0t_64_sync_handler+0xb8/0xbc
 el0t_64_sync+0x190/0x194

During swapoff, try_to_unuse() fails to allocate memory due to a memory limit; the swapoff therefore fails and the swap space is re-inserted into swap_list. During _enable_swap_info(), this swap device is added to the avail list even though it is full. Later, when one entry in this full swap device is released, we try to add the device to the avail list again and find it is already there, which causes the WARN_ON. To fix this, don't add the swap device to the avail list if it is full.

[akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230627120833.2230766-3-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2a4693642071..cad0209ac67f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2328,7 +2328,10 @@ static void _enable_swap_info(struct swap_info_struct *p) * swap_info_struct.
*/ plist_add(&p->list, &swap_active_head); - add_to_avail_list(p); + + /* add to available list iff swap device is not full */ + if (p->highest_bit) + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, -- cgit v1.2.3 From 15b4919a1e0703b77dd7cc0a4d9732f7f6181236 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 1 Jul 2023 11:28:52 +0800 Subject: mm: use a folio in fault_dirty_shared_page() We can replace four implicit calls to compound_head() with one by using folio. Link: https://lkml.kernel.org/r/20230701032853.258697-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 33f0f28c7ebc..e9f9944c7370 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2965,20 +2965,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); + dirtied = folio_mark_dirty(folio); + VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * Take a local copy of the address_space - folio.mapping may be zeroed + * by truncate after folio_unlock(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. */ - mapping = page_rmapping(page); - unlock_page(page); + mapping = folio_raw_mapping(folio); + folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); -- cgit v1.2.3 From fc1878ec70ede56ee48f2d65525d4f7c6888b496 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 1 Jul 2023 11:28:53 +0800 Subject: mm: remove page_rmapping() After converting the last user to folio_raw_mapping(), we can safely remove the function. 
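For context, page_rmapping() was only a thin page-to-folio wrapper; a sketch of the two helpers involved (based on mm/internal.h and mm/util.c, slightly paraphrased, so treat the exact form as an assumption):

  /* Neutral folio->mapping pointer to address_space, anon_vma or other */
  static inline void *folio_raw_mapping(struct folio *folio)
  {
          unsigned long mapping = (unsigned long)folio->mapping;

          return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
  }

  /* The wrapper removed below simply did: */
  void *page_rmapping(struct page *page)
  {
          return folio_raw_mapping(page_folio(page));
  }

With fault_dirty_shared_page() now calling folio_raw_mapping() directly, the wrapper has no remaining users.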
Link: https://lkml.kernel.org/r/20230701032853.258697-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/util.c | 6 ------ 2 files changed, 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 406ab9ea818f..6d150990e35c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2170,7 +2170,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern void *page_rmapping(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* diff --git a/mm/util.c b/mm/util.c index dd12b9531ac4..5e9305189c3f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -734,12 +734,6 @@ void *vcalloc(size_t n, size_t size) } EXPORT_SYMBOL(vcalloc); -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -void *page_rmapping(struct page *page) -{ - return folio_raw_mapping(page_folio(page)); -} - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; -- cgit v1.2.3 From 626e98cb0366e66bdc2088918aecabee1fc6c4b2 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Fri, 30 Jun 2023 11:08:53 +0200 Subject: mm: make MEMFD_CREATE into a selectable config option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memfd_create() syscall, enabled by CONFIG_MEMFD_CREATE, is useful on its own even when not required by CONFIG_TMPFS or CONFIG_HUGETLBFS. Split it into its own proper bool option that can be enabled by users. Move that option into mm/ where the code itself also lies. Also add "select" statements to CONFIG_TMPFS and CONFIG_HUGETLBFS so they automatically enable CONFIG_MEMFD_CREATE as before. Link: https://lkml.kernel.org/r/20230630-config-memfd-v1-1-9acc3ae38b5a@weissschuh.net Signed-off-by: Thomas Weißschuh Tested-by: Zhangjin Wu Cc: Al Viro Cc: Christian Brauner Cc: Darrick J. Wong Signed-off-by: Andrew Morton --- fs/Kconfig | 5 ++--- mm/Kconfig | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..19975b104bc3 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -169,6 +169,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM + select MEMFD_CREATE help Tmpfs is a file system which keeps all files in virtual memory. @@ -240,6 +241,7 @@ config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) + select MEMFD_CREATE help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -264,9 +266,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off (boot command line) or hugetlb_optimize_vmemmap (sysctl). 
-config MEMFD_CREATE - def_bool TMPFS || HUGETLBFS - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/mm/Kconfig b/mm/Kconfig index 09130434e30d..22acffd9009d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1144,6 +1144,9 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY config IO_MAPPING bool +config MEMFD_CREATE + bool "Enable memfd_create() system call" if EXPERT + config SECRETMEM default y bool "Enable memfd_secret() system call" if EXPERT -- cgit v1.2.3 From 1279aa0656bbebbeecbd3bec0dd50faf35e5db38 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:53 +0800 Subject: mm: make show_free_areas() static All callers of show_free_areas() pass 0 and NULL, so we can directly use show_mem() instead of show_free_areas(0, NULL), which could make show_free_areas() a static function. Link: https://lkml.kernel.org/r/20230630062253.189440-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/sparc/kernel/setup_32.c | 2 +- include/linux/mm.h | 12 ------------ mm/internal.h | 6 ++++++ mm/nommu.c | 8 ++++---- mm/show_mem.c | 4 ++-- 5 files changed, 13 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 1adf5c1c16b8..34ef7febf0d5 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -83,7 +83,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0, NULL); + show_mem(); if (!is_idle_task(current)) { local_irq_enable(); ksys_sync(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c3e3eec9008..d1ad22980ebe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2236,18 +2236,6 @@ extern void pagefault_out_of_memory(void); #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) -/* - * Flags passed to show_mem() and show_free_areas() to suppress output in - * various contexts. - */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ - -extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); -static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) -{ - __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); -} - /* * Parameter block passed down to zap_pte_range in exceptional cases. */ diff --git a/mm/internal.h b/mm/internal.h index a7d9e980429a..721ed07d7fd6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -61,6 +61,12 @@ void page_writeback_init(void); #define COMPOUND_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) +/* + * Flags passed to __show_mem() and show_free_areas() to suppress output in + * various contexts. + */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ + /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. 
diff --git a/mm/nommu.c b/mm/nommu.c index c072a660ec2c..9826f6101a05 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1003,7 +1003,7 @@ error_free: enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } @@ -1236,20 +1236,20 @@ error_getting_vma: kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; error_vma_iter_prealloc: kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } diff --git a/mm/show_mem.c b/mm/show_mem.c index 01f8e9905817..09c7d036d49e 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -186,7 +186,7 @@ static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; @@ -406,7 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) struct zone *zone; printk("Mem-Info:\n"); - __show_free_areas(filter, nodemask, max_zone_idx); + show_free_areas(filter, nodemask, max_zone_idx); for_each_populated_zone(zone) { -- cgit v1.2.3 From b53e24c4f6bcc5f38b79c1c344e643bd54b69964 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 22 May 2023 17:43:09 -0700 Subject: mm: call arch_swap_restore() from unuse_pte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We would like to move away from requiring architectures to restore metadata from swap in the set_pte_at() implementation, as this is not only error-prone but adds complexity to the arch-specific code. This requires us to call arch_swap_restore() before calling swap_free() whenever pages are restored from swap. We are currently doing so everywhere except in unuse_pte(); do so there as well. Link: https://lkml.kernel.org/r/20230523004312.1807357-3-pcc@google.com Link: https://linux-review.googlesource.com/id/I68276653e612d64cde271ce1b5a99ae05d6bbc4f Signed-off-by: Peter Collingbourne Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: "Huang, Ying" Reviewed-by: Steven Price Acked-by: Catalin Marinas Cc: Alexandru Elisei Cc: Chinwen Chang Cc: Evgenii Stepanov Cc: Greg Kroah-Hartman Cc: kasan-dev Cc: "Kuan-Ying Lee (李冠穎)" Cc: Qun-Wei Lin Cc: Suren Baghdasaryan Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index cad0209ac67f..d996c335fc3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1778,6 +1778,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto setpte; } + /* + * Some architectures may have to restore extra metadata to the page + * when reading from swap. This metadata may be indexed by swap entry + * so this must be called before swap_free(). 
+ */ + arch_swap_restore(entry, page_folio(page)); + /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); -- cgit v1.2.3
From dd767aaa2fc8f1a000df0504f6231afcafe8a8e9 Mon Sep 17 00:00:00 2001
From: Peter Xu
Date: Wed, 28 Jun 2023 17:53:03 -0400
Subject: mm/hugetlb: handle FOLL_DUMP well in follow_page_mask()

Patch series "mm/gup: Unify hugetlb, speed up thp", v4.

Hugetlb has a special path for slow gup in which follow_page_mask() is skipped completely along with faultin_page(). It's not only confusing, but also duplicates a lot of logic that generic gup already has, making hugetlb slightly special. This patchset tries to dedup the logic by first touching up the slow gup code to handle hugetlb pages correctly with the current follow page and faultin routines (we're mostly there already; ten years ago we did try to optimize thp, but only got half way; more below), then, in the last patch, dropping the special path so that hugetlb gup always goes through the generic routine via faultin_page(). Note that hugetlb is still special for gup, mostly due to the pgtable walking (hugetlb_walk()) that we rely on, which is currently per-arch. But this is still one small step forward, and the diffstat is some evidence that it is worthwhile.

Then for the "speed up thp" side: as a side effect, while looking at this chunk of code, I found that thp support is actually only partially done. It doesn't mean that thp won't work for gup, but as long as a **pages pointer is passed in, the optimization is skipped. Patch 6 should address that, so for thp we now get full speed gup. For a quick number, "chrt -f 1 ./gup_test -m 512 -t -L -n 1024 -r 10" gives me 13992.50us -> 378.50us. Gup_test is an extreme case, but just to show how it affects thp gups.

This patch (of 8):

Firstly, no_page_table() is meaningless for hugetlb, where it is a no-op, because a hugetlb page always satisfies:
- vma_is_anonymous() == false
- vma->vm_ops->fault != NULL
So we can already safely remove it in hugetlb_follow_page_mask(), along with the page* variable.

Meanwhile, what we do in follow_hugetlb_page() actually makes sense for a dump: we try to fault in the page only if the page cache is already allocated. Let's do the same here for follow_page_mask() on hugetlb. So far this should have zero effect on real dumps, because those still go through follow_hugetlb_page(). But it may start to influence follow_page() users who mimic a "dump page" scenario, hopefully in a good way. This also paves the way for unifying the hugetlb gup-slow path.

Link: https://lkml.kernel.org/r/20230628215310.73782-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230628215310.73782-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A.
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 9 ++------- mm/hugetlb.c | 9 +++++++++ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 76d222ccc3ff..9c62cfa7e486 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -811,7 +811,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { pgd_t *pgd; - struct page *page; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; @@ -824,12 +823,8 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * hugetlb_follow_page_mask is only for follow_page() handling here. * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. */ - if (is_vm_hugetlb_page(vma)) { - page = hugetlb_follow_page_mask(vma, address, flags); - if (!page) - page = no_page_table(vma, flags); - return page; - } + if (is_vm_hugetlb_page(vma)) + return hugetlb_follow_page_mask(vma, address, flags); pgd = pgd_offset(mm, address); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 64a3239b6407..4fb396dd65bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6498,6 +6498,15 @@ out: spin_unlock(ptl); out_unlock: hugetlb_vma_unlock_read(vma); + + /* + * Fixup retval for dump requests: if pagecache doesn't exist, + * don't try to allocate a new page but just skip it. + */ + if (!page && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, address)) + page = ERR_PTR(-EFAULT); + return page; } -- cgit v1.2.3 From 458568c92953dee3716234711f1a2830a35261f3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:04 -0400 Subject: mm/hugetlb: prepare hugetlb_follow_page_mask() for FOLL_PIN follow_page() doesn't use FOLL_PIN, meanwhile hugetlb seems to not be the target of FOLL_WRITE either. However add the checks. Namely, either the need to CoW due to missing write bit, or proper unsharing on !AnonExclusive pages over R/O pins to reject the follow page. That brings this function closer to follow_hugetlb_page(). So we don't care before, and also for now. But we'll care if we switch over slow-gup to use hugetlb_follow_page_mask(). We'll also care when to return -EMLINK properly, as that's the gup internal api to mean "we should unshare". Not really needed for follow page path, though. When at it, switching the try_grab_page() to use WARN_ON_ONCE(), to be clear that it just should never fail. When error happens, instead of setting page==NULL, capture the errno instead. Link: https://lkml.kernel.org/r/20230628215310.73782-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/hugetlb.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fb396dd65bd..cc87a51ce71a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6462,13 +6462,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; pte_t *pte, entry; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). 
- */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; + int ret; hugetlb_vma_lock_read(vma); pte = hugetlb_walk(vma, haddr, huge_page_size(h)); @@ -6478,8 +6472,23 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); if (pte_present(entry)) { - page = pte_page(entry) + - ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = pte_page(entry); + + if (!huge_pte_write(entry)) { + if (flags & FOLL_WRITE) { + page = NULL; + goto out; + } + + if (gup_must_unshare(vma, flags, page)) { + /* Tell the caller to do unsharing */ + page = ERR_PTR(-EMLINK); + goto out; + } + } + + page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* * Note that page may be a sub-page, and with vmemmap * optimizations the page struct may be read only. @@ -6489,8 +6498,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, * try_grab_page() should always be able to get the page here, * because we hold the ptl lock and have verified pte_present(). */ - if (try_grab_page(page, flags)) { - page = NULL; + ret = try_grab_page(page, flags); + + if (WARN_ON_ONCE(ret)) { + page = ERR_PTR(ret); goto out; } } -- cgit v1.2.3 From 5502ea44f5ade35d32a397353956bc026b870400 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:05 -0400 Subject: mm/hugetlb: add page_mask for hugetlb_follow_page_mask() follow_page() doesn't need it, but we'll start to need it when unifying gup for hugetlb. Link: https://lkml.kernel.org/r/20230628215310.73782-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 +++++--- mm/gup.c | 3 ++- mm/hugetlb.c | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ca3c8e10f24a..9f282f370d96 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -131,7 +131,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); + unsigned long address, unsigned int flags, + unsigned int *page_mask); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, unsigned long *, unsigned long *, long, unsigned int, int *); @@ -297,8 +298,9 @@ static inline void adjust_range_if_pmd_sharing_possible( { } -static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static inline struct page *hugetlb_follow_page_mask( + struct vm_area_struct *vma, unsigned long address, unsigned int flags, + unsigned int *page_mask) { BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } diff --git a/mm/gup.c b/mm/gup.c index 9c62cfa7e486..818d98b34dec 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -824,7 +824,8 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. 
*/ if (is_vm_hugetlb_page(vma)) - return hugetlb_follow_page_mask(vma, address, flags); + return hugetlb_follow_page_mask(vma, address, flags, + &ctx->page_mask); pgd = pgd_offset(mm, address); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cc87a51ce71a..ab52214b5a75 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6454,7 +6454,8 @@ static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, } struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + unsigned int *page_mask) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; @@ -6504,6 +6505,8 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, page = ERR_PTR(ret); goto out; } + + *page_mask = (1U << huge_page_order(h)) - 1; } out: spin_unlock(ptl); -- cgit v1.2.3 From ffe1e7861211aafe12977a3ed2f11bb6fe1e77ea Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:06 -0400 Subject: mm/gup: cleanup next_page handling The only path that doesn't use generic "**pages" handling is the gate vma. Make it use the same path, meanwhile tune the next_page label upper to cover "**pages" handling. This prepares for THP handling for "**pages". Link: https://lkml.kernel.org/r/20230628215310.73782-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 818d98b34dec..d70f8f0613f4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1207,7 +1207,7 @@ static long __get_user_pages(struct mm_struct *mm, if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, - pages ? &pages[i] : NULL); + pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; @@ -1277,19 +1277,18 @@ retry: ret = PTR_ERR(page); goto out; } - - goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } +next_page: if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); ctx.page_mask = 0; } -next_page: + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; -- cgit v1.2.3 From 57edfcfd3419b4799353d8cbd6ce49da075cfdbd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:07 -0400 Subject: mm/gup: accelerate thp gup even for "pages != NULL" The acceleration of THP was done with ctx.page_mask, however it'll be ignored if **pages is non-NULL. The old optimization was introduced in 2013 in 240aadeedc4a ("mm: accelerate mm_populate() treatment of THP pages"). It didn't explain why we can't optimize the **pages non-NULL case. It's possible that at that time the major goal was for mm_populate() which should be enough back then. Optimize thp for all cases, by properly looping over each subpage, doing cache flushes, and boost refcounts / pincounts where needed in one go. 
This can be verified using gup_test below: # chrt -f 1 ./gup_test -m 512 -t -L -n 1024 -r 10 Before: 13992.50 ( +-8.75%) After: 378.50 (+-69.62%) Link: https://lkml.kernel.org/r/20230628215310.73782-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Lorenzo Stoakes Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index d70f8f0613f4..59e182634ba8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1282,16 +1282,53 @@ retry: goto out; } next_page: - if (pages) { - pages[i] = page; - flush_anon_page(vma, page, start); - flush_dcache_page(page); - ctx.page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; + + if (pages) { + struct page *subpage; + unsigned int j; + + /* + * This must be a large folio (and doesn't need to + * be the whole folio; it can be part of it), do + * the refcount work for all the subpages too. + * + * NOTE: here the page may not be the head page + * e.g. when start addr is not thp-size aligned. + * try_grab_folio() should have taken care of tail + * pages. + */ + if (page_increm > 1) { + struct folio *folio; + + /* + * Since we already hold refcount on the + * large folio, this should never fail. + */ + folio = try_grab_folio(page, page_increm - 1, + foll_flags); + if (WARN_ON_ONCE(!folio)) { + /* + * Release the 1st page ref if the + * folio is problematic, fail hard. + */ + gup_put_folio(page_folio(page), 1, + foll_flags); + ret = -EFAULT; + goto out; + } + } + + for (j = 0; j < page_increm; j++) { + subpage = nth_page(page, j); + pages[i + j] = subpage; + flush_anon_page(vma, subpage, start + j * PAGE_SIZE); + flush_dcache_page(subpage); + } + } + i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; -- cgit v1.2.3 From 4849807114b83e1897381ed3f851632f376a0b7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:08 -0400 Subject: mm/gup: retire follow_hugetlb_page() Now __get_user_pages() should be well prepared to handle thp completely, as long as hugetlb gup requests even without the hugetlb's special path. Time to retire follow_hugetlb_page(). Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal. Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- include/linux/hugetlb.h | 12 --- mm/gup.c | 19 ---- mm/hugetlb.c | 224 ------------------------------------------------ 4 files changed, 1 insertion(+), 256 deletions(-) (limited to 'mm') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b..ae711f1d7a83 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * * We also don't do userfault handling during * coredumping. 
hugetlbfs has the special - * follow_hugetlb_page() to skip missing pages in the + * hugetlb_follow_page_mask() to skip missing pages in the * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f282f370d96..9bc3c2d71b71 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask); -long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, unsigned long *, unsigned long *, - long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask( BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } -static inline long follow_hugetlb_page(struct mm_struct *mm, - struct vm_area_struct *vma, struct page **pages, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *nonblocking) -{ - BUG(); - return 0; -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, diff --git a/mm/gup.c b/mm/gup.c index 59e182634ba8..2493ffa10f4b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. - * - * hugetlb_follow_page_mask is only for follow_page() handling here. - * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. */ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, @@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm, ret = check_vma_flags(vma, gup_flags); if (ret) goto out; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, - &start, &nr_pages, i, - gup_flags, locked); - if (!*locked) { - /* - * We've got a VM_FAULT_RETRY - * and we've lost mmap_lock. - * We must stop here. - */ - BUG_ON(gup_flags & FOLL_NOWAIT); - goto out; - } - continue; - } } retry: /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab52214b5a75..e3839eee4657 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5721,7 +5721,6 @@ out_release_old: /* * Return whether there is a pagecache page to back given address within VMA. - * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. 
*/ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -6422,37 +6421,6 @@ out_release_nounlock: } #endif /* CONFIG_USERFAULTFD */ -static void record_subpages(struct page *page, struct vm_area_struct *vma, - int refs, struct page **pages) -{ - int nr; - - for (nr = 0; nr < refs; nr++) { - if (likely(pages)) - pages[nr] = nth_page(page, nr); - } -} - -static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, - unsigned int flags, pte_t *pte, - bool *unshare) -{ - pte_t pteval = huge_ptep_get(pte); - - *unshare = false; - if (is_swap_pte(pteval)) - return true; - if (huge_pte_write(pteval)) - return false; - if (flags & FOLL_WRITE) - return true; - if (gup_must_unshare(vma, flags, pte_page(pteval))) { - *unshare = true; - return true; - } - return false; -} - struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) @@ -6524,198 +6492,6 @@ out_unlock: return page; } -long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, unsigned long *position, - unsigned long *nr_pages, long i, unsigned int flags, - int *locked) -{ - unsigned long pfn_offset; - unsigned long vaddr = *position; - unsigned long remainder = *nr_pages; - struct hstate *h = hstate_vma(vma); - int err = -EFAULT, refs; - - while (vaddr < vma->vm_end && remainder) { - pte_t *pte; - spinlock_t *ptl = NULL; - bool unshare = false; - int absent; - struct page *page; - - /* - * If we have a pending SIGKILL, don't keep faulting pages and - * potentially allocating memory. - */ - if (fatal_signal_pending(current)) { - remainder = 0; - break; - } - - hugetlb_vma_lock_read(vma); - /* - * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make sure we get the - * first, for the page indexing below to work. - * - * Note that page table lock is not held when pte is null. - */ - pte = hugetlb_walk(vma, vaddr & huge_page_mask(h), - huge_page_size(h)); - if (pte) - ptl = huge_pte_lock(h, mm, pte); - absent = !pte || huge_pte_none(huge_ptep_get(pte)); - - /* - * When coredumping, it suits get_dump_page if we just return - * an error where there's an empty slot with no huge pagecache - * to back it. This way, we avoid allocating a hugepage, and - * the sparse dumpfile avoids allocating disk blocks, but its - * huge holes still show up with zeroes where they need to be. - */ - if (absent && (flags & FOLL_DUMP) && - !hugetlbfs_pagecache_present(h, vma, vaddr)) { - if (pte) - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - remainder = 0; - break; - } - - /* - * We need call hugetlb_fault for both hugepages under migration - * (in which case hugetlb_fault waits for the migration,) and - * hwpoisoned hugepages (in which case we need to prevent the - * caller from accessing to them.) In order to do this, we use - * here is_swap_pte instead of is_hugetlb_entry_migration and - * is_hugetlb_entry_hwpoisoned. This is because it simply covers - * both cases, and because we can't follow correct pages - * directly from any kind of swap entries. 
- */ - if (absent || - __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { - vm_fault_t ret; - unsigned int fault_flags = 0; - - if (pte) - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - - if (flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - else if (unshare) - fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) { - fault_flags |= FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_KILLABLE; - if (flags & FOLL_INTERRUPTIBLE) - fault_flags |= FAULT_FLAG_INTERRUPTIBLE; - } - if (flags & FOLL_NOWAIT) - fault_flags |= FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT; - if (flags & FOLL_TRIED) { - /* - * Note: FAULT_FLAG_ALLOW_RETRY and - * FAULT_FLAG_TRIED can co-exist - */ - fault_flags |= FAULT_FLAG_TRIED; - } - ret = hugetlb_fault(mm, vma, vaddr, fault_flags); - if (ret & VM_FAULT_ERROR) { - err = vm_fault_to_errno(ret, flags); - remainder = 0; - break; - } - if (ret & VM_FAULT_RETRY) { - if (locked && - !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *locked = 0; - *nr_pages = 0; - /* - * VM_FAULT_RETRY must not return an - * error, it will return zero - * instead. - * - * No need to update "position" as the - * caller will not check it after - * *nr_pages is set to 0. - */ - return i; - } - continue; - } - - pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; - page = pte_page(huge_ptep_get(pte)); - - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); - - /* - * If subpage information not requested, update counters - * and skip the same_page loop below. - */ - if (!pages && !pfn_offset && - (vaddr + huge_page_size(h) < vma->vm_end) && - (remainder >= pages_per_huge_page(h))) { - vaddr += huge_page_size(h); - remainder -= pages_per_huge_page(h); - i += pages_per_huge_page(h); - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - continue; - } - - /* vaddr may not be aligned to PAGE_SIZE */ - refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, - (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); - - if (pages) - record_subpages(nth_page(page, pfn_offset), - vma, refs, - likely(pages) ? pages + i : NULL); - - if (pages) { - /* - * try_grab_folio() should always succeed here, - * because: a) we hold the ptl lock, and b) we've just - * checked that the huge page is present in the page - * tables. If the huge page is present, then the tail - * pages must also be present. The ptl prevents the - * head page and tail pages from being rearranged in - * any way. As this is hugetlb, the pages will never - * be p2pdma or not longterm pinable. So this page - * must be available at this point, unless the page - * refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, - flags))) { - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - remainder = 0; - err = -ENOMEM; - break; - } - } - - vaddr += (refs << PAGE_SHIFT); - remainder -= refs; - i += refs; - - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - } - *nr_pages = remainder; - /* - * setting position is actually required only if remainder is - * not zero but it's faster not to add a "if (remainder)" - * branch. - */ - *position = vaddr; - - return i ? 
i : err; } - long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) -- cgit v1.2.3
From f04d16ee3afc049cdfa99500d95dee8b0eb77cfa Mon Sep 17 00:00:00 2001
From: Haibo Li
Date: Wed, 28 Jun 2023 19:02:20 +0800
Subject: mm/filemap.c: fix update prev_pos after one read request done

ra->prev_pos tracks the last visited byte in the previous read request. It is used to decide whether a read is sequential in ondemand_readahead() and thus affects the readahead window. After commit 06c0444290ce ("mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig"), the update logic of prev_pos changed: it now updates prev_pos after each return from filemap_get_pages(). But the user's read request may not be fully completed at that point, and the prematurely updated prev_pos impacts the subsequent readahead window.

The real problem is a performance drop of fsck_msdos between linux-5.4 and linux-5.15 (also linux-6.4). Compared to linux-5.4, it spends about 110% of the time and reads 140% of the pages. The read pattern of fsck_msdos is not fully sequential. A simplified read pattern of fsck_msdos looks like this:

1. read at page offset 0xa, size 0x1000
2. read at another page offset such as 0x20, size 0x1000
3. read at page offset 0xa, size 0x4000
4. read at page offset 0xe, size 0x1000

Here is the read status on linux-6.4:

1. after the read at page offset 0xa, size 0x1000
   -> page ofs 0xa goes into the pagecache
2. after the read at page offset 0x20, size 0x1000
   -> page ofs 0x20 goes into the pagecache
3. read at page offset 0xa, size 0x4000
   -> filemap_get_pages reads ofs 0xa from the pagecache and returns
   -> prev_pos is updated to 0xb and we go to the next loop
   -> filemap_get_pages tends to read ofs 0xb, size 0x3000
   -> initial_readahead case in ondemand_readahead since prev_pos is the same as the request ofs
   -> reads 8 pages while the async size is 5 pages (PageReadahead flag at page 0xe)
4. read at page offset 0xe, size 0x1000
   -> hits page 0xe with the PageReadahead flag set, doubles the ra_size; reads 16 pages while the async size is 16 pages

Now it reads 24 pages while actually using 5 pages.

On linux-5.4:

1. the same as 6.4
2. the same as 6.4
3. read at page offset 0xa, size 0x4000
   -> reads ofs 0xa from the pagecache
   -> reads ofs 0xb, size 0x3000 using page_cache_sync_readahead, reading 3 pages
   -> prev_pos is updated to 0xd before generic_file_buffered_read returns
4. read at page offset 0xe, size 0x1000
   -> initial_readahead case in ondemand_readahead since request ofs - prev_pos == 1
   -> reads 4 pages while the async size is 3 pages

Now it reads 7 pages while actually using 5 pages.

In the demo above, the initial_readahead case is triggered by the offset of the user request on linux-5.4, while it may be triggered by the update logic of prev_pos on linux-6.4. To fix the performance drop, update prev_pos only after one read request is finished.
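A hypothetical userspace reproduction of the simplified access pattern above (offsets are the page indexes from the changelog times a 4 KiB page size; the file name and sizes are illustrative only):

  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          char buf[0x4000];
          int fd = open("testfile", O_RDONLY);     /* hypothetical test file */

          if (fd < 0)
                  return 1;
          pread(fd, buf, 0x1000, 0x0a * 0x1000UL); /* step 1 */
          pread(fd, buf, 0x1000, 0x20 * 0x1000UL); /* step 2 */
          pread(fd, buf, 0x4000, 0x0a * 0x1000UL); /* step 3: partly cached */
          pread(fd, buf, 0x1000, 0x0e * 0x1000UL); /* step 4 */
          close(fd);
          return 0;
  }

With the fix, step 3 no longer updates ra->prev_pos between the internal filemap_get_pages() iterations, so step 4 is classified the same way as on linux-5.4.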
Link: https://lkml.kernel.org/r/20230628110220.120134-1-haibo.li@mediatek.com Signed-off-by: Haibo Li Reviewed-by: Jan Kara Cc: AngeloGioacchino Del Regno Cc: Matthew Wilcox Cc: Matthias Brugger Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c5e2c70ea046..93e495d2d477 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2632,6 +2632,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, int i, error = 0; bool writably_mapped; loff_t isize, end_offset; + loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; @@ -2682,8 +2683,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * When a read accesses the same folio several times, only * mark it as accessed the first time. */ - if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1, - fbatch.folios[0])) + if (!pos_same_folio(iocb->ki_pos, last_pos - 1, + fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { @@ -2710,7 +2711,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, already_read += copied; iocb->ki_pos += copied; - ra->prev_pos = iocb->ki_pos; + last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; @@ -2724,7 +2725,7 @@ put_folios: } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); - + ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); -- cgit v1.2.3 From b7b618da0edc85280e1c9c8f4f5239571e7c1d3e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 28 Jun 2023 09:49:29 +0800 Subject: mm: memory-failure: remove unneeded page state check in shake_page() Remove unneeded PageLRU(p) and is_free_buddy_page(p) check as slab caches are not shrunk now. This check can be added back when a lightweight range based shrinker is available. Link: https://lkml.kernel.org/r/20230628014929.3441386-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ece5d481b5ff..1ddb25a1073e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -363,17 +363,14 @@ void shake_page(struct page *p) { if (PageHuge(p)) return; - - if (!PageSlab(p)) { - lru_add_drain_all(); - if (PageLRU(p) || is_free_buddy_page(p)) - return; - } - /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ + if (PageSlab(p)) + return; + + lru_add_drain_all(); } EXPORT_SYMBOL_GPL(shake_page); -- cgit v1.2.3 From f58d7907a39f8d0381db56af4a590915fd94ee58 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 26 Jun 2023 20:10:53 +0800 Subject: memory tier: use helper function destroy_memory_type() Use helper function destroy_memory_type() to release memtype instead of open code it to help improve code readability a bit. No functional change intended. 
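As context, destroy_memory_type() is expected to be a one-line wrapper around the same kref_put() call being replaced below (sketch based on mm/memory-tiers.c; treat the exact form as an assumption):

  void destroy_memory_type(struct memory_dev_type *memtype)
  {
          kref_put(&memtype->kref, release_memtype);
  }

Calling the helper keeps the release path in one place should the refcounting scheme ever change.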
Link: https://lkml.kernel.org/r/20230626121053.1916447-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index a516e303e304..1719fa3bcf02 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) */ if (!node_memory_types[node].map_count) { node_memory_types[node].memtype = NULL; - kref_put(&memtype->kref, release_memtype); + destroy_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } -- cgit v1.2.3
From 1a7d018dc38b6851c602b448bdac2e78b46857db Mon Sep 17 00:00:00 2001
From: Miaohe Lin
Date: Mon, 26 Jun 2023 19:43:43 +0800
Subject: mm: memory-failure: remove unneeded 'inline' annotation

Remove the unneeded 'inline' annotation from num_poisoned_pages_inc() and num_poisoned_pages_sub(). No functional change intended.

Link: https://lkml.kernel.org/r/20230626114343.1846587-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1ddb25a1073e..9d87f0b8b805 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -75,13 +75,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -inline void num_poisoned_pages_inc(unsigned long pfn) +void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); memblk_nr_poison_inc(pfn); } -inline void num_poisoned_pages_sub(unsigned long pfn, long i) +void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); if (pfn != -1UL) -- cgit v1.2.3
From 416ef04fe00c5f2f6fb8e13d8dbe1b5a0a274f83 Mon Sep 17 00:00:00 2001
From: liuq
Date: Sun, 25 Jun 2023 11:16:56 +0800
Subject: mm/page_alloc: fix min_free_kbytes calculation regarding ZONE_MOVABLE

The current calculation of min_free_kbytes only uses ZONE_DMA and ZONE_NORMAL pages, but the ZONE_MOVABLE zone->_watermark[WMARK_MIN] also takes a share of min_free_kbytes. This causes the min watermark of ZONE_NORMAL to be too small in the presence of ZONE_MOVABLE. __GFP_HIGH and PF_MEMALLOC allocations usually don't need movable zone pages, so just like ZONE_HIGHMEM, cap pages_min to a small value in __setup_per_zone_wmarks().
On my testing machine with 16GB of memory (transparent hugepage is turned off by default, and movablecore=12G is configured) The following is a comparative test data of watermark_min no patch add patch ZONE_DMA 1 8 ZONE_DMA32 151 709 ZONE_NORMAL 233 1113 ZONE_MOVABLE 1434 128 min_free_kbytes 7288 7326 Link: https://lkml.kernel.org/r/20230625031656.23941-1-liuq131@chinatelecom.cn Signed-off-by: liuq Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d3460c7a480..fe04c4e85c42 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5694,9 +5694,9 @@ static void __setup_per_zone_wmarks(void) struct zone *zone; unsigned long flags; - /* Calculate total number of !ZONE_HIGHMEM pages */ + /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ for_each_zone(zone) { - if (!is_highmem(zone)) + if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) lowmem_pages += zone_managed_pages(zone); } @@ -5706,15 +5706,15 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); - if (is_highmem(zone)) { + if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't - * need highmem pages, so cap pages_min to a small - * value here. + * need highmem and movable zones pages, so cap pages_min + * to a small value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas control async page reclaim, and so should - * not be capped for highmem. + * not be capped for highmem and movable zones. */ unsigned long min_pages; -- cgit v1.2.3 From 3fade62b62e84dd8dbf6e92d494b0e7eca750c43 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 25 Jun 2023 10:13:23 +0800 Subject: mm/mm_init.c: remove obsolete macro HASH_SMALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HASH_SMALL only works when parameter numentries is 0. But the sole caller futex_init() never calls alloc_large_system_hash() with numentries set to 0. So HASH_SMALL is obsolete and remove it. Link: https://lkml.kernel.org/r/20230625021323.849147-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport (IBM) Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Miaohe Lin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 +--- kernel/futex/core.c | 3 +-- mm/mm_init.c | 10 +--------- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f71ff9f0ec81..0d031fbfea25 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -581,9 +581,7 @@ extern void *alloc_large_system_hash(const char *tablename, unsigned long high_limit); #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min - * shift passed via *_hash_shift */ -#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ +#define HASH_ZERO 0x00000002 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. 
diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 514e4582b863..f10587d1d481 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1132,8 +1132,7 @@ static int __init futex_init(void) #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, - futex_hashsize < 256 ? HASH_SMALL : 0, + futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; diff --git a/mm/mm_init.c b/mm/mm_init.c index d356ba59ef2a..f90db54e2b21 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2489,15 +2489,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); - /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely(flags & HASH_SMALL)) { - /* Makes no sense without HASH_EARLY */ - WARN_ON(!(flags & HASH_EARLY)); - if (!(numentries >> *_hash_shift)) { - numentries = 1UL << *_hash_shift; - BUG_ON(!numentries); - } - } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); -- cgit v1.2.3 From df9cd3cbf209b9357f2939ba63afa427026e954b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 24 Jun 2023 14:12:14 +0900 Subject: zsmalloc: do not scan for allocated objects in empty zspage Patch series "zsmalloc: small compaction improvements", v2. A tiny series that can reduce the number of find_alloced_obj() invocations (which perform a linear scan of sub-page) during compaction. Inspired by Alexey Romanov's findings. This patch (of 3): zspage migration can terminate as soon as it moves the last allocated object from the source zspage. Add a simple helper zspage_empty() that tests zspage ->inuse on each migration iteration. Link: https://lkml.kernel.org/r/20230624053120.643409-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Alexey Romanov Reviewed-by: Alexey Romanov Acked-by: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 3f057970504e..5d60eaedc3b7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1147,6 +1147,11 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +static bool zspage_empty(struct zspage *zspage) +{ + return get_zspage_inuse(zspage) == 0; +} + /** * zs_lookup_class_index() - Returns index of the zsmalloc &size_class * that hold objects of the provided size. @@ -1625,6 +1630,10 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); + + /* Stop if there are no more objects to migrate */ + if (zspage_empty(get_zspage(s_page))) + break; } /* Remember last position in this iteration */ -- cgit v1.2.3 From 4ce36584da19ff6a5d171a317c6c34c567d4628e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 24 Jun 2023 14:12:15 +0900 Subject: zsmalloc: move migration destination zspage inuse check Destination zspage fullness check need to be done after zs_object_copy() because that's where source and destination zspages fullness change. Checking destination zspage fullness before zs_object_copy() may cause migration to loop through source zspage sub-pages scanning for allocate objects just to find out at the end that the destination zspage is full. 
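The cost being avoided is easiest to see with a toy model of the migration loop. Every name and number below is invented, but the ordering of the linear scan, the fullness test and the copy mirrors the description above:

/* Toy model: where the "destination is full" test sits relative to the copy. */
#include <stdio.h>
#include <stdbool.h>

#define SRC_OBJS 8
#define DST_CAP  3

static int scans;

static int find_next_used(const bool *used, int start)
{
	int i;

	scans++;			/* stands in for the linear sub-page scan */
	for (i = start; i < SRC_OBJS; i++)
		if (used[i])
			return i;
	return -1;
}

static void migrate(bool check_before_copy)
{
	bool src[SRC_OBJS] = { true, true, true, true, true, true, true, true };
	int dst_inuse = 0, idx = 0;

	scans = 0;
	while (1) {
		idx = find_next_used(src, idx);
		if (idx < 0)
			break;			/* source empty */
		if (check_before_copy && dst_inuse == DST_CAP)
			break;			/* old order: the scan above was wasted */
		src[idx++] = false;		/* "copy" one object across */
		dst_inuse++;
		if (!check_before_copy && dst_inuse == DST_CAP)
			break;			/* new order: stop right after the filling copy */
	}
	printf("fullness check %s copy: %d scans\n",
	       check_before_copy ? "before" : "after", scans);
}

int main(void)
{
	migrate(true);		/* prints 4 scans */
	migrate(false);		/* prints 3 scans */
	return 0;
}

Checking after the copy drops the extra scan that the old ordering performs once the destination has just filled up.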
Link: https://lkml.kernel.org/r/20230624053120.643409-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d60eaedc3b7..4a84f7877669 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1620,10 +1620,6 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, continue; } - /* Stop if there is no more space */ - if (zspage_full(class, get_zspage(d_page))) - break; - used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); @@ -1631,6 +1627,10 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, record_obj(handle, free_obj); obj_free(class->size, used_obj); + /* Stop if there is no more space */ + if (zspage_full(class, get_zspage(d_page))) + break; + /* Stop if there are no more objects to migrate */ if (zspage_empty(get_zspage(s_page))) break; -- cgit v1.2.3 From ada5caed79b313b3046839c9de9bf9048561e4bb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 24 Jun 2023 14:12:16 +0900 Subject: zsmalloc: remove zs_compact_control __zs_compact always putback src_zspage into class list after migrate_zspage. Thus, we don't need to keep last position of src_zspage any more. Let's remove it. Link: https://lkml.kernel.org/r/20230624053120.643409-4-senozhatsky@chromium.org Signed-off-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Acked-by: Sergey Senozhatsky Cc: Alexey Romanov Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4a84f7877669..84beadc088b8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1590,25 +1590,14 @@ static unsigned long find_alloced_obj(struct size_class *class, return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); } -struct zs_compact_control { - /* Source spage for migration which could be a subpage of zspage */ - struct page *s_page; - /* Destination page for migration which should be a first page - * of zspage. */ - struct page *d_page; - /* Starting object index within @s_page which used for live object - * in the subpage. 
*/ - int obj_idx; -}; - -static void migrate_zspage(struct zs_pool *pool, struct size_class *class, - struct zs_compact_control *cc) +static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, + struct zspage *dst_zspage) { unsigned long used_obj, free_obj; unsigned long handle; - struct page *s_page = cc->s_page; - struct page *d_page = cc->d_page; - int obj_idx = cc->obj_idx; + int obj_idx = 0; + struct page *s_page = get_first_page(src_zspage); + struct size_class *class = pool->size_class[src_zspage->class]; while (1) { handle = find_alloced_obj(class, s_page, &obj_idx); @@ -1621,24 +1610,20 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(pool, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, dst_zspage, handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); /* Stop if there is no more space */ - if (zspage_full(class, get_zspage(d_page))) + if (zspage_full(class, dst_zspage)) break; /* Stop if there are no more objects to migrate */ - if (zspage_empty(get_zspage(s_page))) + if (zspage_empty(src_zspage)) break; } - - /* Remember last position in this iteration */ - cc->s_page = s_page; - cc->obj_idx = obj_idx; } static struct zspage *isolate_src_zspage(struct size_class *class) @@ -2013,7 +1998,6 @@ static unsigned long zs_can_compact(struct size_class *class) static unsigned long __zs_compact(struct zs_pool *pool, struct size_class *class) { - struct zs_compact_control cc; struct zspage *src_zspage = NULL; struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; @@ -2031,7 +2015,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (!dst_zspage) break; migrate_write_lock(dst_zspage); - cc.d_page = get_first_page(dst_zspage); } src_zspage = isolate_src_zspage(class); @@ -2040,9 +2023,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, migrate_write_lock_nested(src_zspage); - cc.obj_idx = 0; - cc.s_page = get_first_page(src_zspage); - migrate_zspage(pool, class, &cc); + migrate_zspage(pool, src_zspage, dst_zspage); fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); -- cgit v1.2.3 From a2ebb51575828209b3e9d6f3c6576f7a7c70d0f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 23 Jun 2023 22:15:17 +0200 Subject: mm/page_alloc: use write_seqlock_irqsave() instead write_seqlock() + local_irq_save(). __build_all_zonelists() acquires zonelist_update_seq by first disabling interrupts via local_irq_save() and then acquiring the seqlock with write_seqlock(). This is troublesome and leads to problems on PREEMPT_RT. The problem is that the inner spinlock_t becomes a sleeping lock on PREEMPT_RT and must not be acquired with disabled interrupts. The API provides write_seqlock_irqsave() which does the right thing in one step. printk_deferred_enter() has to be invoked in non-migrate-able context to ensure that deferred printing is enabled and disabled on the same CPU. This is the case after zonelist_update_seq has been acquired. There was discussion on the first submission that the order should be: local_irq_disable(); printk_deferred_enter(); write_seqlock(); to avoid pitfalls like having an unaccounted printk() coming from write_seqlock_irqsave() before printk_deferred_enter() is invoked. The only origin of such a printk() can be a lockdep splat because the lockdep annotation happens after the sequence count is incremented. 
This is exceptional and subject to change. It was also pointed that PREEMPT_RT can be affected by the printk problem since its write_seqlock_irqsave() does not really disable interrupts. This isn't the case because PREEMPT_RT's printk implementation differs from the mainline implementation in two important aspects: - Printing happens in a dedicated threads and not at during the invocation of printk(). - In emergency cases where synchronous printing is used, a different driver is used which does not use tty_port::lock. Acquire zonelist_update_seq with write_seqlock_irqsave() and then defer printk output. Link: https://lkml.kernel.org/r/20230623201517.yw286Knb@linutronix.de Fixes: 1007843a91909 ("mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Cc: Boqun Feng Cc: Ingo Molnar Cc: John Ogness Cc: Luis Claudio R. Goncalves Cc: Mel Gorman Cc: Peter Zijlstra Cc: Petr Mladek Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe04c4e85c42..b51bbc485a28 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5139,19 +5139,17 @@ static void __build_all_zonelists(void *data) unsigned long flags; /* - * Explicitly disable this CPU's interrupts before taking seqlock - * to prevent any IRQ handler from calling into the page allocator - * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + * The zonelist_update_seq must be acquired with irqsave because the + * reader can be invoked from IRQ with GFP_ATOMIC. */ - local_irq_save(flags); + write_seqlock_irqsave(&zonelist_update_seq, flags); /* - * Explicitly disable this CPU's synchronous printk() before taking - * seqlock to prevent any printk() from trying to hold port->lock, for + * Also disable synchronous printk() to prevent any printk() from + * trying to hold port->lock, for * tty_insert_flip_string_and_push_buffer() on other CPU might be * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. */ printk_deferred_enter(); - write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5188,9 +5186,8 @@ static void __build_all_zonelists(void *data) #endif } - write_sequnlock(&zonelist_update_seq); printk_deferred_exit(); - local_irq_restore(flags); + write_sequnlock_irqrestore(&zonelist_update_seq, flags); } static noinline void __init -- cgit v1.2.3 From df263d9a7dffee94ca5391120ee3b0587efa07f1 Mon Sep 17 00:00:00 2001 From: Mika Penttilä Date: Wed, 7 Jun 2023 20:29:44 +0300 Subject: mm/migrate_device: try to handle swapcache pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrating file pages and swapcache pages into device memory is not supported. Try to get rid of the swap cache, and if successful, go ahead as with other anonymous pages. 
Link: https://lkml.kernel.org/r/20230607172944.11713-1-mpenttil@redhat.com Signed-off-by: Mika Penttilä Reviewed-by: "Huang, Ying" Reviewed-by: Alistair Popple Cc: John Hubbard Cc: Ralph Campbell Signed-off-by: Andrew Morton --- mm/migrate_device.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8365158460ed..e29626e1329e 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -727,13 +727,22 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (is_device_private_page(newpage) || is_device_coherent_page(newpage)) { - /* - * For now only support anonymous memory migrating to - * device private or coherent memory. - */ if (mapping) { - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + struct folio *folio; + + folio = page_folio(page); + + /* + * For now only support anonymous memory migrating to + * device private or coherent memory. + * + * Try to get rid of swap cache if possible. + */ + if (!folio_test_anon(folio) || + !folio_free_swap(folio)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } } } else if (is_zone_device_page(newpage)) { /* -- cgit v1.2.3 From 79271476b3362a9e69adae949a520647f8af3559 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:28 +0800 Subject: ksm: support unsharing KSM-placed zero pages Patch series "ksm: support tracking KSM-placed zero-pages", v10. The core idea of this patch set is to enable users to perceive the number of any pages merged by KSM, regardless of whether use_zero_page switch has been turned on, so that users can know how much free memory increase is really due to their madvise(MERGEABLE) actions. But the problem is, when enabling use_zero_pages, all empty pages will be merged with kernel zero pages instead of with each other as use_zero_pages is disabled, and then these zero-pages are no longer monitored by KSM. The motivations to do this is seen at: https://lore.kernel.org/lkml/202302100915227721315@zte.com.cn/ In one word, we hope to implement the support for KSM-placed zero pages tracking without affecting the feature of use_zero_pages, so that app developer can also benefit from knowing the actual KSM profit by getting KSM-placed zero pages to optimize applications eventually when /sys/kernel/mm/ksm/use_zero_pages is enabled. This patch (of 5): When use_zero_pages of ksm is enabled, madvise(addr, len, MADV_UNMERGEABLE) and other ways (like write 2 to /sys/kernel/mm/ksm/run) to trigger unsharing will *not* actually unshare the shared zeropage as placed by KSM (which is against the MADV_UNMERGEABLE documentation). As these KSM-placed zero pages are out of the control of KSM, the related counts of ksm pages don't expose how many zero pages are placed by KSM (these special zero pages are different from those initially mapped zero pages, because the zero pages mapped to MADV_UNMERGEABLE areas are expected to be a complete and unshared page). To not blindly unshare all shared zero_pages in applicable VMAs, the patch use pte_mkdirty (related with architecture) to mark KSM-placed zero pages. Thus, MADV_UNMERGEABLE will only unshare those KSM-placed zero pages. In addition, we'll reuse this mechanism to reliably identify KSM-placed ZeroPages to properly account for them (e.g., calculating the KSM profit that includes zeropages) in the latter patches. The patch will not degrade the performance of use_zero_pages as it doesn't change the way of merging empty pages in use_zero_pages's feature. 
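For reference, the user-visible knob being fixed is plain madvise(). The sketch below is a minimal round trip and assumes CONFIG_KSM with use_zero_pages enabled and ksmd given time to run; the sleep length is arbitrary and error handling is trimmed. Only with this series does the MADV_UNMERGEABLE call also unshare the KSM-placed zero pages in the range:

/* Minimal madvise(MERGEABLE/UNMERGEABLE) round trip; error handling trimmed. */
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(buf, 0, len);			/* lots of identical (zero) pages */
	madvise(buf, len, MADV_MERGEABLE);	/* hand the range to ksmd */
	sleep(10);				/* give ksmd time to merge */

	/* With use_zero_pages=1 these pages end up backed by the shared
	 * zeropage.  After this series, MADV_UNMERGEABLE unshares them
	 * again, as the madvise(2) documentation promises. */
	madvise(buf, len, MADV_UNMERGEABLE);
	munmap(buf, len);
	return 0;
}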
Link: https://lkml.kernel.org/r/202306131104554703428@zte.com.cn Link: https://lkml.kernel.org/r/20230613030928.185882-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 ++++++ mm/ksm.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 899a314bc487..98878107244f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,12 @@ int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); +/* + * To identify zeropages that were mapped by KSM, we reuse the dirty bit + * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when + * deduplicating memory. + */ +#define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { diff --git a/mm/ksm.c b/mm/ksm.c index ba266359da55..99519e22a761 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -448,7 +448,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex if (is_migration_entry(entry)) page = pfn_swap_entry_to_page(entry); } - ret = page && PageKsm(page); + /* return 1 if the page is an normal ksm page or KSM-placed zero page */ + ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); pte_unmap_unlock(pte, ptl); return ret; } @@ -1222,8 +1223,12 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { - newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), - vma->vm_page_prot)); + /* + * Use pte_mkdirty to mark the zero page mapped by KSM, and then + * we can easily track all KSM-placed zero pages by checking if + * the dirty bit in zero page's PTE is set. + */ + newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we -- cgit v1.2.3 From e2942062e01df85b4692460fe5b48ab0c90fdb95 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:34 +0800 Subject: ksm: count all zero pages placed by KSM As pages_sharing and pages_shared don't include the number of zero pages merged by KSM, we cannot know how many pages are zero pages placed by KSM when enabling use_zero_pages, which leads to KSM not being transparent with all actual merged pages by KSM. In the early days of use_zero_pages, zero-pages was unable to get unshared by the ways like MADV_UNMERGEABLE so it's hard to count how many times one of those zeropages was then unmerged. But now, unsharing KSM-placed zero page accurately has been achieved, so we can easily count both how many times a page full of zeroes was merged with zero-page and how many times one of those pages was then unmerged. and so, it helps to estimate memory demands when each and every shared page could get unshared. So we add ksm_zero_pages under /sys/kernel/mm/ksm/ to show the number of all zero pages placed by KSM. Meanwhile, we update the Documentation. 
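A small reader for the new counter might look like the sketch below. It assumes a kernel with this series applied (otherwise /sys/kernel/mm/ksm/ksm_zero_pages simply does not exist) and uses the "pages_sharing + ksm_zero_pages" interpretation from the documentation update:

/* Estimate system-wide pages saved by KSM, including KSM-placed zeropages. */
#include <stdio.h>
#include <unistd.h>

static long read_counter(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = 0;

	if (!f)
		return 0;	/* e.g. ksm_zero_pages absent on older kernels */
	if (fscanf(f, "%ld", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	long sharing = read_counter("/sys/kernel/mm/ksm/pages_sharing");
	long zero    = read_counter("/sys/kernel/mm/ksm/ksm_zero_pages");
	long page_kb = sysconf(_SC_PAGESIZE) / 1024;

	printf("pages_sharing:  %ld\n", sharing);
	printf("ksm_zero_pages: %ld\n", zero);
	printf("approx saved:   %ld KiB\n", (sharing + zero) * page_kb);
	return 0;
}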
Link: https://lkml.kernel.org/r/20230613030934.185944-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 7 +++++++ include/linux/ksm.h | 12 ++++++++++++ mm/khugepaged.c | 2 ++ mm/ksm.c | 12 ++++++++++++ mm/memory.c | 5 ++++- 5 files changed, 37 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 7626392fe82c..6cc919dbfd55 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages +ksm_zero_pages + how many zero pages that are still mapped into processes were mapped by + KSM when deduplicating. + +When ``use_zero_pages`` is/was enabled, the sum of ``pages_sharing`` + +``ksm_zero_pages`` represents the actual number of pages saved by KSM. +if ``use_zero_pages`` has never been enabled, ``ksm_zero_pages`` is 0. A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 98878107244f..e80aa49009b2 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,6 +33,14 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) +extern unsigned long ksm_zero_pages; + +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ + if (is_ksm_zero_pte(pte)) + ksm_zero_pages--; +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { int ret; @@ -101,6 +109,10 @@ static inline void ksm_exit(struct mm_struct *mm) { } +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ +} + #ifdef CONFIG_MEMORY_FAILURE static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 78c8d5d8b628..419981dcc889 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -709,6 +710,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); + ksm_might_unmap_zero_page(pteval); } } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c index 99519e22a761..e037d9aad691 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* The number of zero pages which is placed by KSM */ +unsigned long ksm_zero_pages; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -1229,6 +1232,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * the dirty bit in zero page's PTE is set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); + ksm_zero_pages++; /* * We're replacing an anonymous page with a zero page, which is * not anonymous. 
We need to do proper accounting otherwise we @@ -3356,6 +3360,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t ksm_zero_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%ld\n", ksm_zero_pages); +} +KSM_ATTR_RO(ksm_zero_pages); + static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3423,6 +3434,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c index e9f9944c7370..c256da05bb5e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - if (unlikely(!page)) + if (unlikely(!page)) { + ksm_might_unmap_zero_page(ptent); continue; + } delay_rmap = 0; if (!PageAnon(page)) { @@ -3128,6 +3130,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } } else { + ksm_might_unmap_zero_page(vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); -- cgit v1.2.3 From 6080d19f07043ade61094d0f58b14c05e1694a39 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:38 +0800 Subject: ksm: add ksm zero pages for each process As the number of ksm zero pages is not included in ksm_merging_pages per process when enabling use_zero_pages, it's unclear of how many actual pages are merged by KSM. To let users accurately estimate their memory demands when unsharing KSM zero-pages, it's necessary to show KSM zero- pages per process. In addition, it help users to know the actual KSM profit because KSM-placed zero pages are also benefit from KSM. since unsharing zero pages placed by KSM accurately is achieved, then tracking empty pages merging and unmerging is not a difficult thing any longer. Since we already have /proc//ksm_stat, just add the information of 'ksm_zero_pages' in it. 
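Per process, the same information can be read back from the ksm_stat file under /proc. The parser below is only a sketch: it assumes the field layout added by this patch (ksm_zero_pages printed next to ksm_merging_pages, ksm_rmap_items and ksm_process_profit) and skips error handling beyond the open:

/* Dump the per-process KSM counters for a given pid (sketch, no error paths). */
#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], key[32];
	long val;
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/ksm_stat",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* Expect lines such as "ksm_zero_pages 42" after this patch. */
	while (fscanf(f, "%31s %ld", key, &val) == 2)
		printf("%-20s %ld\n", key, val);
	fclose(f);
	return 0;
}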
Link: https://lkml.kernel.org/r/20230613030938.185993-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- fs/proc/base.c | 1 + include/linux/ksm.h | 8 +++++--- include/linux/mm_types.h | 9 +++++++-- mm/khugepaged.c | 2 +- mm/ksm.c | 1 + mm/memory.c | 4 ++-- 6 files changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872..eb2e498e3b8d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e80aa49009b2..c2dd786a30e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -35,10 +35,12 @@ void __ksm_exit(struct mm_struct *mm); extern unsigned long ksm_zero_pages; -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { - if (is_ksm_zero_pte(pte)) + if (is_ksm_zero_pte(pte)) { ksm_zero_pages--; + mm->ksm_zero_pages--; + } } static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) @@ -109,7 +111,7 @@ static inline void ksm_exit(struct mm_struct *mm) { } -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e74ce4a28cd..51d04c1847c1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages). */ unsigned long ksm_merging_pages; /* @@ -820,7 +820,12 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; -#endif + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages; +#endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN struct { /* this mm_struct is on lru_gen_mm_list */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 419981dcc889..4b8b8673d5d9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -710,7 +710,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); - ksm_might_unmap_zero_page(pteval); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c index e037d9aad691..e1772081e8cb 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1233,6 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_zero_pages++; + mm->ksm_zero_pages++; /* * We're replacing an anonymous page with a zero page, which is * not anonymous. 
We need to do proper accounting otherwise we diff --git a/mm/memory.c b/mm/memory.c index c256da05bb5e..5f863b1a0edc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1434,7 +1434,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); if (unlikely(!page)) { - ksm_might_unmap_zero_page(ptent); + ksm_might_unmap_zero_page(mm, ptent); continue; } @@ -3130,7 +3130,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } } else { - ksm_might_unmap_zero_page(vmf->orig_pte); + ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); -- cgit v1.2.3 From 1a8e84305783bddbae708f28178c6d0aa6321913 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:42 +0800 Subject: ksm: consider KSM-placed zeropages when calculating KSM profit When use_zero_pages is enabled, the calculation of ksm profit is not correct because ksm zero pages is not counted in. So update the calculation of KSM profit including the documentation. Link: https://lkml.kernel.org/r/20230613030942.186041-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Xiaokai Ran Cc: Yang Yang Cc: Jiang Xuexin Cc: Claudio Imbrenda Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 18 +++++++++++------- mm/ksm.c | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 6cc919dbfd55..5c5be7bd84b8 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -203,21 +203,25 @@ several times, which are unprofitable memory consumed. 1) How to determine whether KSM save memory or consume memory in system-wide range? Here is a simple approximate calculation for reference:: - general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * + general_profit =~ ksm_saved_pages * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); - where all_rmap_items can be easily obtained by summing ``pages_sharing``, - ``pages_shared``, ``pages_unshared`` and ``pages_volatile``. + where ksm_saved_pages equals to the sum of ``pages_sharing`` + + ``ksm_zero_pages`` of the system, and all_rmap_items can be easily + obtained by summing ``pages_sharing``, ``pages_shared``, ``pages_unshared`` + and ``pages_volatile``. 2) The KSM profit inner a single process can be similarly obtained by the following approximate calculation:: - process_profit =~ ksm_merging_pages * sizeof(page) - + process_profit =~ ksm_saved_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item). - where ksm_merging_pages is shown under the directory ``/proc//``, - and ksm_rmap_items is shown in ``/proc//ksm_stat``. The process profit - is also shown in ``/proc//ksm_stat`` as ksm_process_profit. + where ksm_saved_pages equals to the sum of ``ksm_merging_pages`` and + ``ksm_zero_pages``, both of which are shown under the directory + ``/proc//ksm_stat``, and ksm_rmap_items is also shown in + ``/proc//ksm_stat``. The process profit is also shown in + ``/proc//ksm_stat`` as ksm_process_profit. 
From the perspective of application, a high ratio of ``ksm_rmap_items`` to ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or diff --git a/mm/ksm.c b/mm/ksm.c index e1772081e8cb..97a9627116fa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3092,7 +3092,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { - return mm->ksm_merging_pages * PAGE_SIZE - + return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ @@ -3373,7 +3373,7 @@ static ssize_t general_profit_show(struct kobject *kobj, { long general_profit; - general_profit = ksm_pages_sharing * PAGE_SIZE - + general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); -- cgit v1.2.3 From 82d9b8c85b7e4bd85c679ac2da26b57224c4999d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 Jul 2023 19:18:23 +0800 Subject: mm: page_alloc: avoid false page outside zone error info If pfn is outside zone boundaries in the first round, ret will be set to 1. But if pfn is changed to inside the zone boundaries in zone span seqretry path, ret is still set to 1 leading to false page outside zone error info. This is from code inspection. The race window should be really small thus hard to trigger in real world. [akpm@linux-foundation.org: code simplification, per Matthew] Link: https://lkml.kernel.org/r/20230704111823.940331-1-linmiaohe@huawei.com Fixes: bdc8cb984576 ("[PATCH] memory hotplug locking: zone span seqlock") Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b51bbc485a28..1eb3864e1dbc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -459,7 +459,7 @@ void set_pageblock_migratetype(struct page *page, int migratetype) #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { - int ret = 0; + int ret; unsigned seq; unsigned long pfn = page_to_pfn(page); unsigned long sp, start_pfn; @@ -468,8 +468,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) seq = zone_span_seqbegin(zone); start_pfn = zone->zone_start_pfn; sp = zone->spanned_pages; - if (!zone_spans_pfn(zone, pfn)) - ret = 1; + ret = !zone_spans_pfn(zone, pfn); } while (zone_span_seqretry(zone, seq)); if (ret) -- cgit v1.2.3 From 86327e8eb94c52eca4f93cfece2e29d1bf52acbf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 4 Jul 2023 13:52:40 +0200 Subject: memcg: drop kmem.limit_in_bytes kmem.limit_in_bytes (v1 way to limit kernel memory usage) has been deprecated since 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes") merged in 5.16. We haven't heard about any serious users since then but it seems that the mere presence of the file is causing more harm thatn good. We (SUSE) have had several bug reports from customers where Docker based containers started to fail because a write to kmem.limit_in_bytes has failed. This was unexpected because runc code only expects ENOENT (kmem disabled) or EBUSY (tasks already running within cgroup). So a new error code was unexpected and the whole container startup failed. 
This has been later addressed by https://github.com/opencontainers/runc/commit/52390d68040637dfc77f9fda6bbe70952423d380 so current Docker runtimes do not suffer from the problem anymore. There are still older version of Docker in use and likely hard to get rid of completely. Address this by wiping out the file completely and effectively get back to pre 4.5 era and CONFIG_MEMCG_KMEM=n configuration. I would recommend backporting to stable trees which have picked up 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes"). [mhocko@suse.com: restore _KMEM switch case] Link: https://lkml.kernel.org/r/ZKe5wxdbvPi5Cwd7@dhcp22.suse.cz Link: https://lkml.kernel.org/r/20230704115240.14672-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Muchun Song Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 -- mm/memcontrol.c | 10 ---------- 2 files changed, 12 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index fabaad3fd9c2..8d3afeede10e 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -92,8 +92,6 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes This knob is deprecated and writing to - it will return -ENOTSUPP. memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8ca4bdcb03c..ab99503c9ff2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3871,10 +3871,6 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _MEMSWAP: ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; - case _KMEM: - /* kmem.limit_in_bytes is deprecated. */ - ret = -EOPNOTSUPP; - break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); break; @@ -5085,12 +5081,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif - { - .name = "kmem.limit_in_bytes", - .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write = mem_cgroup_write, - .read_u64 = mem_cgroup_read_u64, - }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), -- cgit v1.2.3 From 72de259130229412ca49871e70ffaf17dc9fba98 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Jul 2023 06:33:14 +0000 Subject: mm/memfd: sysctl: fix MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED Patch series "mm/memfd: fix sysctl MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED", v2. When sysctl vm.memfd_noexec is 2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED), memfd_create(.., MFD_EXEC) should fail. This complies with how MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED is defined - "memfd_create() without MFD_NOEXEC_SEAL will be rejected" Thanks to Dominique Martinet who reported the bug. see [1] for context. [1] https://lore.kernel.org/linux-mm/CABi2SkXUX_QqTQ10Yx9bBUGpN1wByOi_=gZU6WEy5a8MaQY3Jw@mail.gmail.com/T/ This patch (of 2): When vm.memfd_noexec is 2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED), memfd_create(.., MFD_EXEC) should fail. 
This complies with how MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED is defined - "memfd_create() without MFD_NOEXEC_SEAL will be rejected" Link: https://lkml.kernel.org/r/20230705063315.3680666-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20230705063315.3680666-2-jeffxu@google.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Reported-by: Dominique Martinet Closes: https://lore.kernel.org/linux-mm/CABi2SkXUX_QqTQ10Yx9bBUGpN1wByOi_=gZU6WEy5a8MaQY3Jw@mail.gmail.com/T/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306301351.kkbSegQW-lkp@intel.com/ Signed-off-by: Jeff Xu Cc: Daniel Verkamp Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Shuah Khan Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/memfd.c | 57 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/memfd.c b/mm/memfd.c index e763e76f1106..0bdbd2335af7 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -268,6 +268,36 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) +static int check_sysctl_memfd_noexec(unsigned int *flags) +{ +#ifdef CONFIG_SYSCTL + char comm[TASK_COMM_LEN]; + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + if (ns) + sysctl = ns->memfd_noexec_scope; + + if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { + if (sysctl == MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) + *flags |= MFD_NOEXEC_SEAL; + else + *flags |= MFD_EXEC; + } + + if (*flags & MFD_EXEC && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { + pr_warn_once( + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + + return -EACCES; + } +#endif + + return 0; +} + SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) @@ -294,35 +324,14 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { -#ifdef CONFIG_SYSCTL - int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; - struct pid_namespace *ns; - - ns = task_active_pid_ns(current); - if (ns) - sysctl = ns->memfd_noexec_scope; - - switch (sysctl) { - case MEMFD_NOEXEC_SCOPE_EXEC: - flags |= MFD_EXEC; - break; - case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL: - flags |= MFD_NOEXEC_SEAL; - break; - default: - pr_warn_once( - "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); - return -EINVAL; - } -#else - flags |= MFD_EXEC; -#endif pr_warn_once( "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", task_pid_nr(current), get_task_comm(comm, current)); } + if (check_sysctl_memfd_noexec(&flags) < 0) + return -EACCES; + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) -- cgit v1.2.3 From bded67f81ec47e6054ad24c1c7992a6523a9b2c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 14:39:05 +0800 Subject: memory tier: rename destroy_memory_type() to put_memory_type() It appears that destroy_memory_type() isn't a very good name because we usually will not free the memory_type here. So rename it to a more appropriate name i.e. put_memory_type(). 
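The rename is about reference-count semantics: a "put" only frees the object when the last reference is dropped, which is exactly what the kref_put() inside the helper does. A tiny userspace model of that behaviour, with all names invented:

/* Toy refcount model: "put" frees only on the last reference drop. */
#include <stdio.h>
#include <stdlib.h>

struct memtype_model {
	int refs;	/* stands in for struct kref */
};

static void release_memtype_model(struct memtype_model *m)
{
	printf("last reference gone, object freed\n");
	free(m);
}

static void put_memory_type_model(struct memtype_model *m)
{
	/* models kref_put(&memtype->kref, release_memtype) */
	if (--m->refs == 0)
		release_memtype_model(m);
	else
		printf("reference dropped, %d still held\n", m->refs);
}

int main(void)
{
	struct memtype_model *m = malloc(sizeof(*m));

	m->refs = 2;			/* two users of the memory type */
	put_memory_type_model(m);	/* not destroyed, despite the old name */
	put_memory_type_model(m);	/* now it actually goes away */
	return 0;
}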
Link: https://lkml.kernel.org/r/20230706063905.543800-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Huang, Ying Reviewed-by: "Huang, Ying" Reviewed-by: Xiao Yang Cc: Dan Williams Cc: Dave Jiang Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/dax/kmem.c | 4 ++-- include/linux/memory-tiers.h | 4 ++-- mm/memory-tiers.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 898ca9505754..c57acb73e3db 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -264,7 +264,7 @@ static int __init dax_kmem_init(void) return rc; error_dax_driver: - destroy_memory_type(dax_slowmem_type); + put_memory_type(dax_slowmem_type); err_dax_slowmem_type: kfree_const(kmem_name); return rc; @@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void) dax_driver_unregister(&device_dax_kmem_driver); if (!any_hotremove_failed) kfree_const(kmem_name); - destroy_memory_type(dax_slowmem_type); + put_memory_type(dax_slowmem_type); } MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index fc9647b1b4f9..437441cdf78f 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -33,7 +33,7 @@ struct memory_dev_type { #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; struct memory_dev_type *alloc_memory_type(int adistance); -void destroy_memory_type(struct memory_dev_type *memtype); +void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION @@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int adistance) return NULL; } -static inline void destroy_memory_type(struct memory_dev_type *memtype) +static inline void put_memory_type(struct memory_dev_type *memtype) { } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 1719fa3bcf02..c49ab03f49b1 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance) } EXPORT_SYMBOL_GPL(alloc_memory_type); -void destroy_memory_type(struct memory_dev_type *memtype) +void put_memory_type(struct memory_dev_type *memtype) { kref_put(&memtype->kref, release_memtype); } -EXPORT_SYMBOL_GPL(destroy_memory_type); +EXPORT_SYMBOL_GPL(put_memory_type); void init_node_memory_type(int node, struct memory_dev_type *memtype) { @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) */ if (!node_memory_types[node].map_count) { node_memory_types[node].memtype = NULL; - destroy_memory_type(memtype); + put_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } -- cgit v1.2.3 From 35fb4764c8b2cdd91820761eb03c18106d46b8ae Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Fri, 7 Jul 2023 00:03:34 +0530 Subject: mm: cma: print cma name as well in cma_alloc debug CMA allocation can happen either from global cma or from dedicated cma region. Thus it is helpful to print cma name as well during initial debugging to confirm cma regions were getting initialized or not. 
Link: https://lkml.kernel.org/r/1688668414-12350-1-git-send-email-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index a4cfe995e11e..4880f72102fa 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -436,8 +436,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, - count, align); + pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, + (void *)cma, cma->name, count, align); if (!count) goto out; -- cgit v1.2.3 From dba438bd7663fefab870a6dd4b01ed0923c32d79 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Jul 2023 20:52:51 +0100 Subject: rmap: pass the folio to __page_check_anon_rmap() The lone caller already has the folio, so pass it in instead of deriving it from the page again. Link: https://lkml.kernel.org/r/20230706195251.2707542-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 0c0d8857dfce..2668f5ea3534 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1175,14 +1175,14 @@ out: /** * __page_check_anon_rmap - sanity check anonymous rmap addition - * @page: the page to add the mapping to + * @folio: The folio containing @page. + * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ -static void __page_check_anon_rmap(struct page *page, +static void __page_check_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1262,7 +1262,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); else - __page_check_anon_rmap(page, vma, address); + __page_check_anon_rmap(folio, page, vma, address); } mlock_vma_folio(folio, vma, compound); -- cgit v1.2.3 From 0201ebf274a306a6ebb95e5dc2d6a0a27c737cac Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:51 +0100 Subject: mm: merge folio_has_private()/filemap_release_folio() call pairs Patch series "mm, netfs, fscache: Stop read optimisation when folio removed from pagecache", v7. This fixes an optimisation in fscache whereby we don't read from the cache for a particular file until we know that there's data there that we don't have in the pagecache. The problem is that I'm no longer using PG_fscache (aka PG_private_2) to indicate that the page is cached and so I don't get a notification when a cached page is dropped from the pagecache. The first patch merges some folio_has_private() and filemap_release_folio() pairs and introduces a helper, folio_needs_release(), to indicate if a release is required. The second patch is the actual fix. Following Willy's suggestions[1], it adds an AS_RELEASE_ALWAYS flag to an address_space that will make filemap_release_folio() always call ->release_folio(), even if PG_private/PG_private_2 aren't set. folio_needs_release() is altered to add a check for this. 
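Taken together, the check that gates ->release_folio() ends up looking roughly like the userspace sketch below. This is a paraphrase of the two patches, not the exact hunk: the structs are stand-ins, and the mapping_release_always() accessor for AS_RELEASE_ALWAYS is assumed from the pagemap.h side of the series, which is only partially visible here.

/* Userspace model of the release gate after both patches (flag names mirror
 * the series; the structs are stand-ins, not the kernel's). */
#include <stdbool.h>
#include <stdio.h>

struct mapping_model { bool release_always; };	/* AS_RELEASE_ALWAYS */
struct folio_model {
	bool has_private;			/* PG_private / PG_private_2 */
	struct mapping_model *mapping;
};

static bool folio_needs_release_model(const struct folio_model *folio)
{
	/* Patch 1: centralise the PG_private check here.
	 * Patch 2: also release when the mapping asks for it unconditionally,
	 * so fscache hears about every folio leaving the pagecache. */
	return folio->has_private ||
	       (folio->mapping && folio->mapping->release_always);
}

int main(void)
{
	struct mapping_model fscache_mapping = { .release_always = true };
	struct folio_model plain = { false, NULL };
	struct folio_model cached = { false, &fscache_mapping };

	printf("plain folio needs release:   %d\n", folio_needs_release_model(&plain));
	printf("fscache folio needs release: %d\n", folio_needs_release_model(&cached));
	return 0;
}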
This patch (of 2): Make filemap_release_folio() check folio_has_private(). Then, in most cases, where a call to folio_has_private() is immediately followed by a call to filemap_release_folio(), we can get rid of the test in the pair. There are a couple of sites in mm/vscan.c that this can't so easily be done. In shrink_folio_list(), there are actually three cases (something different is done for incompletely invalidated buffers), but filemap_release_folio() elides two of them. In shrink_active_list(), we don't have have the folio lock yet, so the check allows us to avoid locking the page unnecessarily. A wrapper function to check if a folio needs release is provided for those places that still need to do it in the mm/ directory. This will acquire additional parts to the condition in a future patch. After this, the only remaining caller of folio_has_private() outside of mm/ is a check in fuse. Link: https://lkml.kernel.org/r/20230628104852.3391651-1-dhowells@redhat.com Link: https://lkml.kernel.org/r/20230628104852.3391651-2-dhowells@redhat.com Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Signed-off-by: David Howells Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Xiubo Li Cc: Jingbo Xu Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 12 ++++-------- fs/splice.c | 3 +-- mm/filemap.c | 2 ++ mm/huge_memory.c | 3 +-- mm/internal.h | 8 ++++++++ mm/khugepaged.c | 3 +-- mm/memory-failure.c | 8 +++----- mm/migrate.c | 3 +-- mm/truncate.c | 6 ++---- mm/vmscan.c | 8 ++++---- 10 files changed, 27 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f4b4861a74ee..18a9e7c47975 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -340,10 +340,8 @@ again: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +360,8 @@ data_copy: /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. 
*/ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } diff --git a/fs/splice.c b/fs/splice.c index 3e2a31e1ce6a..0d3deeb3857e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* diff --git a/mm/filemap.c b/mm/filemap.c index 93e495d2d477..dd022b065614 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4073,6 +4073,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); + if (!folio_needs_release(folio)) + return true; if (folio_test_writeback(folio)) return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97..9f3109ed7351 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2697,8 +2697,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); - if (folio_test_private(folio) && - !filemap_release_folio(folio, gfp)) { + if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } diff --git a/mm/internal.h b/mm/internal.h index 721ed07d7fd6..822b13de3780 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -176,6 +176,14 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * Return true if a folio needs ->release_folio() calling upon it. + */ +static inline bool folio_needs_release(struct folio *folio) +{ + return folio_has_private(folio); +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4b8b8673d5d9..4e707da4a83c 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2078,8 +2078,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) { + if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d87f0b8b805..76da955bf10f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -936,14 +936,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn, struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { + if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_NOIO)) { + else if (!filemap_release_folio(folio, GFP_NOIO)) pr_info("%#lx: failed to release buffers\n", pfn); - } else { + else ret = MF_RECOVERED; - } } else { /* * If the file system doesn't support it just invalidate diff --git a/mm/migrate.c b/mm/migrate.c index 24baad2571e3..e9821e245e70 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -922,8 +922,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. 
*/ - if (folio_test_private(src) && - !filemap_release_folio(src, GFP_KERNEL)) + if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_folio(mapping, dst, src, mode); diff --git a/mm/truncate.c b/mm/truncate.c index 2f28cc0e12ef..bd4fafd67f95 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,7 +19,6 @@ #include #include #include -#include /* grr. try_to_release_page */ #include #include #include "internal.h" @@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping, if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; - if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) + if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); @@ -573,8 +572,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, if (folio->mapping != mapping) return 0; - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1080209a568b..4039620d30fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2064,7 +2064,7 @@ retry: * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. */ - if (folio_has_private(folio)) { + if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { @@ -2729,9 +2729,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_test_private(folio) && folio_trylock(folio)) { - if (folio_test_private(folio)) - filemap_release_folio(folio, 0); + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); folio_unlock(folio); } } -- cgit v1.2.3 From b4fa966f03b7401ceacd4ffd7227197afb2b8376 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:52 +0100 Subject: mm, netfs, fscache: stop read optimisation when folio removed from pagecache Fscache has an optimisation by which reads from the cache are skipped until we know that (a) there's data there to be read and (b) that data isn't entirely covered by pages resident in the netfs pagecache. This is done with two flags manipulated by fscache_note_page_release(): if (... test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to indicate that netfslib should download from the server or clear the page instead. The fscache_note_page_release() function is intended to be called from ->releasepage() - but that only gets called if PG_private or PG_private_2 is set - and currently the former is at the discretion of the network filesystem and the latter is only set whilst a page is being written to the cache, so sometimes we miss clearing the optimisation. Fix this by following Willy's suggestion[1] and adding an address_space flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call ->release_folio() if it's set, even if PG_private or PG_private_2 aren't set. Note that this would require folio_test_private() and page_has_private() to become more complicated. 
To avoid that, in the places[*] where these are used to conditionalise calls to filemap_release_folio() and try_to_release_page(), the tests are removed, those functions are simply called unconditionally, and the test is performed there instead. [*] There are some exceptions in vmscan.c where the check guards more than just a call to the releaser. I've added a function, folio_needs_release(), to wrap all the checks for that. AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from fscache and cleared in ->evict_inode() before truncate_inode_pages_final() is called. Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared and the optimisation cancelled if a cachefiles object already contains data when we open it. [dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()] Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page") Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Signed-off-by: Dave Wysochanski Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Tested-by: SeongJae Park Cc: Daire Byrne Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: Andreas Dilger Cc: Jingbo Xu Cc: "Theodore Ts'o" Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/9p/cache.c | 2 ++ fs/afs/internal.h | 2 ++ fs/cachefiles/namei.c | 2 ++ fs/ceph/cache.c | 2 ++ fs/nfs/fscache.c | 3 +++ fs/smb/client/fscache.c | 2 ++ include/linux/pagemap.h | 16 ++++++++++++++++ mm/internal.h | 5 ++++- 8 files changed, 33 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/fs/9p/cache.c b/fs/9p/cache.c index cebba4eaa0b5..12c0ae29f185 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9d3d64921106..da73b97e19a9 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d9d22d0ec38a..7bf7a5fcc045 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe..de1dee46d3df 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) diff --git
a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b1..b05717fe0d4e 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index 8f6909d633da..3677525ee993 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716953ee1ebd..0ab0f2362b9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -203,6 +203,7 @@ enum mapping_flags { /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, + AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ }; /** @@ -273,6 +274,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } +static inline bool mapping_release_always(const struct address_space *mapping) +{ + return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_set_release_always(struct address_space *mapping) +{ + set_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_clear_release_always(struct address_space *mapping) +{ + clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; diff --git a/mm/internal.h b/mm/internal.h index 822b13de3780..483add0bfb28 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -181,7 +181,10 @@ static inline void set_page_refcounted(struct page *page) */ static inline bool folio_needs_release(struct folio *folio) { - return folio_has_private(folio); + struct address_space *mapping = folio_mapping(folio); + + return folio_has_private(folio) || + (mapping && mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; -- cgit v1.2.3 From 9651eeab3c5fceb45d06230bf0839337206450ad Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 7 Jul 2023 23:39:53 +0800 Subject: mm: correct stale comment of function check_pte Commit 2aff7a4755bed ("mm: Convert page_vma_mapped_walk to work on PFNs") replaced page with pfns in page_vma_mapped_walk structure and updated "@pvmw->page" to "@pvmw->pfn" in comment of function page_vma_mapped_walk. This patch update stale "page" to "pfn" in comment of check_pte. 
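For context, the [pvmw->pfn, pvmw->pfn + pvmw->nr_pages) range that the corrected kernel-doc now describes is filled in by the caller before the walk starts; check_pte() then validates each candidate pte against that range. A rough caller-side sketch (not part of this patch; folio_is_mapped_at() is a made-up name, while DEFINE_FOLIO_VMA_WALK(), page_vma_mapped_walk() and page_vma_mapped_walk_done() are the upstream helpers as best recalled):

static bool folio_is_mapped_at(struct folio *folio, struct vm_area_struct *vma,
			       unsigned long address)
{
	/* Seeds pvmw.pfn and pvmw.nr_pages from the folio. */
	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);

	while (page_vma_mapped_walk(&pvmw)) {
		/* pvmw.pte now maps one page of the pfn range. */
		page_vma_mapped_walk_done(&pvmw);
		return true;
	}

	return false;
}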
Link: https://lkml.kernel.org/r/20230707153953.1380615-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 49e0d28f0379..e0b368e545ed 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -73,20 +73,22 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) } /** - * check_pte - check if @pvmw->page is mapped at the @pvmw->pte - * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking + * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is + * mapped at the @pvmw->pte + * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range + * for checking * - * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* + * page_vma_mapped_walk() found a place where pfn range is *potentially* * mapped. check_pte() has to validate this. * * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to * arbitrary page. * * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration - * entry that points to @pvmw->page or any subpage in case of THP. + * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) * * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to - * pvmw->page or any subpage in case of THP. + * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) * * Otherwise, return false. * -- cgit v1.2.3 From 809ef83ccb61fedc951eccf876a327e940bc412a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 7 Jul 2023 17:00:34 +0800 Subject: mm: fix some kernel-doc comments Add description of @mm_wr_locked and @mm. to silence the warnings: mm/memory.c:1716: warning: Function parameter or member 'mm_wr_locked' not described in 'unmap_vmas' mm/memory.c:5110: warning: Function parameter or member 'mm' not described in 'mm_account_fault' Link: https://lkml.kernel.org/r/20230707090034.125511-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5f863b1a0edc..7fb87a9c025a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1697,6 +1697,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping + * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -5084,7 +5085,7 @@ retry_pud: /** * mm_account_fault - Do page fault accounting - * + * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. -- cgit v1.2.3 From 94ec20035b05f842dc08277a5a90fba757088f39 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Jul 2023 16:51:46 +0800 Subject: mm: compaction: use the correct type of list for free pages Use the page->buddy_list instead of page->lru to clarify the correct type of list for free pages. 
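It may be worth noting why this is a clarity change rather than a behavioural one: buddy_list (and pcp_list) appear to share storage with lru inside a union in struct page, so the list_head being walked is the same either way; only the field name now matches how the page allocator actually uses it for free pages. A simplified, approximate view (illustrative only, not the real struct page definition):

struct page {
	unsigned long flags;
	union {
		struct list_head lru;		/* LRU / pageout usage */
		struct list_head buddy_list;	/* buddy allocator free lists */
		struct list_head pcp_list;	/* per-cpu page lists */
	};
	/* ... many other fields elided ... */
};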
Link: https://lkml.kernel.org/r/b21cd8e2e32b9a1d9bc9e43ebf8acaf35e87f8df.1688715750.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Huang, Ying Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index dbc9f86b1934..43358efdbdc2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1500,7 +1500,7 @@ static void fast_isolate_freepages(struct compact_control *cc) spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; - list_for_each_entry_reverse(freepage, freelist, lru) { + list_for_each_entry_reverse(freepage, freelist, buddy_list) { unsigned long pfn; order_scanned++; @@ -1883,7 +1883,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; - list_for_each_entry(freepage, freelist, lru) { + list_for_each_entry(freepage, freelist, buddy_list) { unsigned long free_pfn; if (nr_scanned++ >= limit) { -- cgit v1.2.3 From e6e0c7673012f42c6fb8d89af71cd7607c93e0a5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Jul 2023 16:51:47 +0800 Subject: mm: compaction: skip the memory hole rapidly when isolating free pages Just like commit 9721fd82351d ("mm: compaction: skip memory hole rapidly when isolating migratable pages"), I can see it will also take more time to skip the larger memory hole (range: 0x1000000000 - 0x1800000000) when isolating free pages on my machine with below memory layout. So like commit 9721fd82351d, adding a new helper to skip the memory hole rapidly, which can reduce the time consumed from about 70us to less than 1us. 
[ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000040000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x0000001fa7ffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000040000000-0x0000000fffffffff] [ 0.000000] node 0: [mem 0x0000001800000000-0x0000001fa3c7ffff] [ 0.000000] node 0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff] [ 0.000000] node 0: [mem 0x0000001fa4000000-0x0000001fa402ffff] [ 0.000000] node 0: [mem 0x0000001fa4030000-0x0000001fa40effff] [ 0.000000] node 0: [mem 0x0000001fa40f0000-0x0000001fa73cffff] [ 0.000000] node 0: [mem 0x0000001fa73d0000-0x0000001fa745ffff] [ 0.000000] node 0: [mem 0x0000001fa7460000-0x0000001fa746ffff] [ 0.000000] node 0: [mem 0x0000001fa7470000-0x0000001fa758ffff] [ 0.000000] node 0: [mem 0x0000001fa7590000-0x0000001fa7ffffff] [shikemeng@huaweicloud.com: avoid missing last page block in section after skip offline sections] Link: https://lkml.kernel.org/r/20230804110454.2935878-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230804110454.2935878-2-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/d2ba7e41ee566309b594311207ffca736375fc16.1688715750.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 43358efdbdc2..02239abed6ce 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -249,11 +249,36 @@ static unsigned long skip_offline_sections(unsigned long start_pfn) return 0; } + +/* + * If the PFN falls into an offline section, return the end PFN of the + * next online section in reverse. If the PFN falls into an online section + * or if there is no next online section in reverse, return 0. + */ +static unsigned long skip_offline_sections_reverse(unsigned long start_pfn) +{ + unsigned long start_nr = pfn_to_section_nr(start_pfn); + + if (!start_nr || online_section_nr(start_nr)) + return 0; + + while (start_nr-- > 0) { + if (online_section_nr(start_nr)) + return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION; + } + + return 0; +} #else static unsigned long skip_offline_sections(unsigned long start_pfn) { return 0; } + +static unsigned long skip_offline_sections_reverse(unsigned long start_pfn) +{ + return 0; +} #endif /* @@ -1668,8 +1693,15 @@ static void isolate_freepages(struct compact_control *cc) page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); - if (!page) + if (!page) { + unsigned long next_pfn; + + next_pfn = skip_offline_sections_reverse(block_start_pfn); + if (next_pfn) + block_start_pfn = max(next_pfn, low_pfn); + continue; + } /* Check the block is suitable for migration */ if (!suitable_migration_target(cc, page)) -- cgit v1.2.3 From c200a7119bc7dc9430e8287563e5343b154ff9d0 Mon Sep 17 00:00:00 2001 From: liuq Date: Fri, 7 Jul 2023 14:05:01 +0800 Subject: mm/sparse: remove redundant judgments from macro for_each_present_section_nr next_present_section_nr() has already ensured that 'section_nr<=__highest_present_section_nr', so this check is removed. 
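For reference, the redundancy is visible from the shape of the iterator itself. Roughly paraphrased from mm/sparse.c (details may differ slightly; present_section_nr() and __highest_present_section_nr are the kernel's own helper and state):

static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
	do {
		section_nr++;
		if (present_section_nr(section_nr))
			return section_nr;
	} while (section_nr <= __highest_present_section_nr);

	return -1;
}

Any value other than -1 that it returns is a present section and is therefore already no higher than the highest present section number, so re-checking the bound in the loop macro adds nothing.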
Link: https://lkml.kernel.org/r/20230707060501.29184-1-liuq131@chinatelecom.cn Signed-off-by: liuq Signed-off-by: Andrew Morton --- mm/sparse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index 297a8b772e8d..77d91e565045 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -172,8 +172,7 @@ static void __section_mark_present(struct mem_section *ms, #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ - ((section_nr != -1) && \ - (section_nr <= __highest_present_section_nr)); \ + section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) static inline unsigned long first_present_section_nr(void) -- cgit v1.2.3 From 3ce2c24cb68f228590a053d6058a5901cd31af61 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 7 Jul 2023 11:38:59 +0800 Subject: mm: hugetlb_vmemmap: fix a race between vmemmap pmd split The local variable @page in __split_vmemmap_huge_pmd() to obtain a pmd page without holding page_table_lock may possiblely get the page table page instead of a huge pmd page. The effect may be in set_pte_at() since we may pass an invalid page struct, if set_pte_at() wants to access the page struct (e.g. CONFIG_PAGE_TABLE_CHECK is enabled), it may crash the kernel. So fix it. And inline __split_vmemmap_huge_pmd() since it only has one user. Link: https://lkml.kernel.org/r/20230707033859.16148-1-songmuchun@bytedance.com Fixes: d8d55f5616cf ("mm: sparsemem: use page table lock to protect kernel pmd operations") Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c2007ef5e9b0..4b9734777f69 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -36,14 +36,22 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; unsigned long addr = start; - struct page *page = pmd_page(*pmd); - pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + struct page *head; + pte_t *pgtable; + + spin_lock(&init_mm.page_table_lock); + head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; + spin_unlock(&init_mm.page_table_lock); + if (!head) + return 0; + + pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; @@ -53,7 +61,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; - entry = mk_pte(page + i, pgprot); + entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } @@ -65,8 +73,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) * be treated as indepdenent small pages (as they can be freed * individually). */ - if (!PageReserved(page)) - split_page(page, get_order(PMD_SIZE)); + if (!PageReserved(head)) + split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). 
*/ smp_wmb(); @@ -80,20 +88,6 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) return 0; } -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - int leaf; - - spin_lock(&init_mm.page_table_lock); - leaf = pmd_leaf(*pmd); - spin_unlock(&init_mm.page_table_lock); - - if (!leaf) - return 0; - - return __split_vmemmap_huge_pmd(pmd, start); -} - static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) -- cgit v1.2.3 From 3d243659d94fd6d521c4573ec467bacef911ccb3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:44 -0700 Subject: mm/memory: convert do_page_mkwrite() to use folios Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7fb87a9c025a..5209f3d80948 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2933,7 +2933,7 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { vm_fault_t ret; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; @@ -2948,14 +2948,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - unlock_page(page); + folio_lock(folio); + if (!folio->mapping) { + folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } -- cgit v1.2.3 From 5a97858b51658ccb1a20a3273eb9fedf8fcef6a5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:45 -0700 Subject: mm/memory: convert wp_page_shared() to use folios Saves six implicit calls to compound_head(). 
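The implicit calls being saved come from the page-based compatibility helpers, each of which re-derives the folio from the page before doing the real work. A rough sketch of their shape (illustrative only, not the exact upstream definitions) shows where the repeated lookups hide and why resolving the folio once and passing it down is cheaper:

/* Approximate shape of the page-based wrappers. */
static inline void lock_page(struct page *page)
{
	folio_lock(page_folio(page));	/* page_folio() ~ compound_head() */
}

static inline void unlock_page(struct page *page)
{
	folio_unlock(page_folio(page));
}

static inline void get_page(struct page *page)
{
	folio_get(page_folio(page));
}

static inline void put_page(struct page *page)
{
	folio_put(page_folio(page));
}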
Link: https://lkml.kernel.org/r/20230706163847.403202-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 5209f3d80948..a88e57d927bd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3283,13 +3283,13 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return 0; } -static vm_fault_t wp_page_shared(struct vm_fault *vmf) +static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; - get_page(vmf->page); + folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; @@ -3298,21 +3298,21 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(vmf->page); + folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - unlock_page(vmf->page); - put_page(vmf->page); + folio_unlock(folio); + folio_put(folio); return tmp; } } else { wp_page_reuse(vmf); - lock_page(vmf->page); + folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); - put_page(vmf->page); + folio_put(folio); return ret; } @@ -3363,6 +3363,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (vmf->page) + folio = page_folio(vmf->page); + /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. @@ -3377,12 +3380,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) */ if (!vmf->page) return wp_pfn_shared(vmf); - return wp_page_shared(vmf); + return wp_page_shared(vmf, folio); } - if (vmf->page) - folio = page_folio(vmf->page); - /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. -- cgit v1.2.3 From 6f609b7e37dff1e8b2261e93da8e2e9848d5513c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:46 -0700 Subject: mm/memory: convert do_shared_fault() to folios Saves three implicit calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230706163847.403202-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a88e57d927bd..7bebd6909199 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4602,21 +4602,24 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; + struct folio *folio; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + folio = page_folio(vmf->page); + /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(vmf->page); + folio_unlock(folio); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(vmf->page); + folio_put(folio); return tmp; } } @@ -4624,8 +4627,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(vmf->page); - put_page(vmf->page); + folio_unlock(folio); + folio_put(folio); return ret; } -- cgit v1.2.3 From 22d1e68f5a23f8b068da77af6d037bc73748c6e3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:47 -0700 Subject: mm/memory: convert do_read_fault() to use folios Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 7bebd6909199..ff19719da032 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4536,6 +4536,7 @@ static inline bool should_fault_around(struct vm_fault *vmf) static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; + struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback @@ -4553,9 +4554,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; ret |= finish_fault(vmf); - unlock_page(vmf->page); + folio = page_folio(vmf->page); + folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(vmf->page); + folio_put(folio); return ret; } -- cgit v1.2.3 From 60b1e24ce8c3334d9204d6229356b750632136be Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 8 Jul 2023 10:33:04 +0800 Subject: mm/memcg: minor cleanup for MEM_CGROUP_ID_MAX MEM_CGROUP_ID_MAX is only used when CONFIG_MEMCG is configured. So remove unneeded !CONFIG_MEMCG variant. Also it's only used in mem_cgroup_alloc(), so move it from memcontrol.h to memcontrol.c. And further define it as: #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) so if someone changes MEM_CGROUP_ID_SHIFT in the future, then MEM_CGROUP_ID_MAX will be updated accordingly, as suggested by Muchun. 
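With MEM_CGROUP_ID_SHIFT at 16, the derived value is the same 65535 as the old USHRT_MAX-based definition, so no behaviour changes. A throwaway userspace check of that equivalence (illustrative only):

#include <limits.h>

#define MEM_CGROUP_ID_SHIFT	16
#define MEM_CGROUP_ID_MAX	((1UL << MEM_CGROUP_ID_SHIFT) - 1)

_Static_assert(MEM_CGROUP_ID_MAX == USHRT_MAX,
	       "derived maximum still matches the old USHRT_MAX value");

int main(void) { return 0; }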
Link: https://lkml.kernel.org/r/20230708023304.1184111-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5818af8eca5a..58eb7ca65699 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -61,7 +61,6 @@ struct mem_cgroup_reclaim_cookie { #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX struct mem_cgroup_id { int id; @@ -1158,7 +1157,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 -#define MEM_CGROUP_ID_MAX 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab99503c9ff2..3eaeb69ef9f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5155,6 +5155,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * those references are manageable from userspace. */ +#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) -- cgit v1.2.3 From af19487f00f34ff8643921d7909dbb3fedc7e329 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:33 -0700 Subject: mm: make PTE_MARKER_SWAPIN_ERROR more general Patch series "add UFFDIO_POISON to simulate memory poisoning with UFFD", v4. This series adds a new userfaultfd feature, UFFDIO_POISON. See commit 4 for a detailed description of the feature. This patch (of 8): Future patches will reuse PTE_MARKER_SWAPIN_ERROR to implement UFFDIO_POISON, so make some various preparations for that: First, rename it to just PTE_MARKER_POISONED. The "SWAPIN" can be confusing since we're going to re-use it for something not really related to swap. This can be particularly confusing for things like hugetlbfs, which doesn't support swap whatsoever. Also rename some various helper functions. Next, fix pte marker copying for hugetlbfs. Previously, it would WARN on seeing a PTE_MARKER_SWAPIN_ERROR, since hugetlbfs doesn't support swap. But, since we're going to re-use it, we want it to go ahead and copy it just like non-hugetlbfs memory does today. Since the code to do this is more complicated now, pull it out into a helper which can be re-used in both places. While we're at it, also make it slightly more explicit in its handling of e.g. uffd wp markers. For non-hugetlbfs page faults, instead of returning VM_FAULT_SIGBUS for an error entry, return VM_FAULT_HWPOISON. For most cases this change doesn't matter, e.g. a userspace program would receive a SIGBUS either way. But for UFFDIO_POISON, this change will let KVM guests get an MCE out of the box, instead of giving a SIGBUS to the hypervisor and requiring it to somehow inject an MCE. Finally, for hugetlbfs faults, handle PTE_MARKER_POISONED, and return VM_FAULT_HWPOISON_LARGE in such cases. Note that this can't happen today because the lack of swap support means we'll never end up with such a PTE anyway, but this behavior will be needed once such entries *can* show up via UFFDIO_POISON. 
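The practical difference between VM_FAULT_SIGBUS and VM_FAULT_HWPOISON is mostly visible in the siginfo delivered to userspace: a hwpoison fault is normally reported as SIGBUS with si_code BUS_MCEERR_AR (which is also what allows a hypervisor to forward it as a guest MCE), whereas VM_FAULT_SIGBUS shows up as a plain BUS_ADRERR. A hedged userspace sketch of telling the two apart (assumes the usual glibc siginfo fields; error handling trimmed):

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *info, void *uc)
{
	(void)sig;
	(void)uc;

	if (info->si_code == BUS_MCEERR_AR)
		fprintf(stderr, "memory error (hwpoison) at %p\n", info->si_addr);
	else
		fprintf(stderr, "plain SIGBUS (code %d) at %p\n",
			info->si_code, info->si_addr);
	_exit(1);
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction	= sigbus_handler,
		.sa_flags	= SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* ... then touch memory that may carry a poison marker ... */
	return 0;
}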
Link: https://lkml.kernel.org/r/20230707215540.2324998-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230707215540.2324998-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 19 +++++++++++++++++++ include/linux/swapops.h | 15 ++++++++++----- mm/hugetlb.c | 32 +++++++++++++++++++++----------- mm/madvise.c | 2 +- mm/memory.c | 15 +++++++++------ mm/mprotect.c | 4 ++-- mm/shmem.c | 4 ++-- mm/swapfile.c | 2 +- 8 files changed, 65 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 21d6c72bcc71..a86c84600787 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -523,6 +523,25 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +/* + * Computes the pte marker to copy from the given source entry into dst_vma. + * If no marker should be copied, returns 0. + * The caller should insert a new pte created with make_pte_marker(). + */ +static inline pte_marker copy_pte_marker( + swp_entry_t entry, struct vm_area_struct *dst_vma) +{ + pte_marker srcm = pte_marker_get(entry); + /* Always copy error entries. */ + pte_marker dstm = srcm & PTE_MARKER_POISONED; + + /* Only copy PTE markers if UFFD register matches. */ + if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) + dstm |= PTE_MARKER_UFFD_WP; + + return dstm; +} + /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4c932cb45e0b..bff1e8d97de0 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_SWAPIN_ERROR BIT(1) +/* + * "Poisoned" here is meant in the very general sense of "future accesses are + * invalid", instead of referring very specifically to hardware memory errors. + * This marker is meant to represent any of various different causes of this. 
+ */ +#define PTE_MARKER_POISONED BIT(1) #define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) @@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } -static inline swp_entry_t make_swapin_error_entry(void) +static inline swp_entry_t make_poisoned_swp_entry(void) { - return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); + return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_swapin_error_entry(swp_entry_t entry) +static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); + (pte_marker_get(entry) & PTE_MARKER_POISONED); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e3839eee4657..ffee2978dfed 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -5101,15 +5102,12 @@ again: entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { - /* No swap on hugetlb */ - WARN_ON_ONCE( - is_swapin_error_entry(pte_to_swp_entry(entry))); - /* - * We copy the pte marker only if the dst vma has - * uffd-wp enabled. - */ - if (userfaultfd_wp(dst_vma)) - set_huge_pte_at(dst, addr, dst_pte, entry); + pte_marker marker = copy_pte_marker( + pte_to_swp_entry(entry), dst_vma); + + if (marker) + set_huge_pte_at(dst, addr, dst_pte, + make_pte_marker(marker)); } else { entry = huge_ptep_get(src_pte); pte_folio = page_folio(pte_page(entry)); @@ -6089,14 +6087,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_ptep_get(ptep); - /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) + if (huge_pte_none_mostly(entry)) { + if (is_pte_marker(entry)) { + pte_marker marker = + pte_marker_get(pte_to_swp_entry(entry)); + + if (marker & PTE_MARKER_POISONED) { + ret = VM_FAULT_HWPOISON_LARGE; + goto out_mutex; + } + } + /* + * Other PTE markers should be handled the same way as none PTE. + * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. 
*/ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); + } ret = 0; diff --git a/mm/madvise.c b/mm/madvise.c index 05f97038eac3..da65f8bd9ac3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -664,7 +664,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, free_swap_and_cache(entry); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } else if (is_hwpoison_entry(entry) || - is_swapin_error_entry(entry)) { + is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; diff --git a/mm/memory.c b/mm/memory.c index ff19719da032..36b164ee9ffb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -860,8 +860,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) - set_pte_at(dst_mm, addr, dst_pte, pte); + pte_marker marker = copy_pte_marker(entry, dst_vma); + + if (marker) + set_pte_at(dst_mm, addr, dst_pte, + make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -1502,7 +1505,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, !zap_drop_file_uffd_wp(details)) continue; } else if (is_hwpoison_entry(entry) || - is_swapin_error_entry(entry)) { + is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) continue; } else { @@ -3651,7 +3654,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * none pte. Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed - * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) @@ -3697,8 +3700,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ - if (marker & PTE_MARKER_SWAPIN_ERROR) - return VM_FAULT_SIGBUS; + if (marker & PTE_MARKER_POISONED) + return VM_FAULT_HWPOISON; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); diff --git a/mm/mprotect.c b/mm/mprotect.c index 6f658d483704..5c3112d92466 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -230,10 +230,10 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mkuffd_wp(newpte); } else if (is_pte_marker_entry(entry)) { /* - * Ignore swapin errors unconditionally, + * Ignore error swap entries unconditionally, * because any access should sigbus anyway. 
*/ - if (is_swapin_error_entry(entry)) + if (is_poisoned_swp_entry(entry)) continue; /* * If this is uffd-wp pte marker and we'd like diff --git a/mm/shmem.c b/mm/shmem.c index 8dfd72bdc86a..235f2b2fd202 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1707,7 +1707,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(); + swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); @@ -1752,7 +1752,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*foliop); *foliop = NULL; - if (is_swapin_error_entry(swap)) + if (is_poisoned_swp_entry(swap)) return -EIO; si = get_swap_device(swap); diff --git a/mm/swapfile.c b/mm/swapfile.c index d996c335fc3c..346e22b8ae97 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1771,7 +1771,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, swp_entry = make_hwpoison_entry(swapcache); page = swapcache; } else { - swp_entry = make_swapin_error_entry(); + swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; -- cgit v1.2.3 From 435cdb41a76fcfa5d6af7e0e39bb8ab5ef4b7a64 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:35 -0700 Subject: mm: userfaultfd: extract file size check out into a helper This code is already duplicated twice, and UFFDIO_POISON will do the same check a third time. So, it's worth extracting into a helper to save repetitive lines of code. Link: https://lkml.kernel.org/r/20230707215540.2324998-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a2bf37ee276d..4244ca7ee903 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -45,6 +45,22 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, return dst_vma; } +/* Check if dst_addr is outside of file's size. Must be called with ptl held. */ +static bool mfill_file_over_size(struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct inode *inode; + pgoff_t offset, max_off; + + if (!dst_vma->vm_file) + return false; + + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + return offset >= max_off; +} + /* * Install PTEs, to map dst_addr (within dst_vma) to page. 
* @@ -64,8 +80,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, bool page_in_cache = page_mapping(page); spinlock_t *ptl; struct folio *folio; - struct inode *inode; - pgoff_t offset, max_off; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -81,14 +95,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (!dst_pte) goto out; - if (vma_is_shmem(dst_vma)) { - /* serialize against truncate with the page table lock */ - inode = dst_vma->vm_file->f_inode; - offset = linear_page_index(dst_vma, dst_addr); - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; - if (unlikely(offset >= max_off)) - goto out_unlock; + goto out_unlock; } ret = -EEXIST; @@ -211,8 +220,6 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; - pgoff_t offset, max_off; - struct inode *inode; _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); @@ -220,14 +227,9 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; - if (dst_vma->vm_file) { - /* the shmem MAP_PRIVATE case requires checking the i_size */ - inode = dst_vma->vm_file->f_inode; - offset = linear_page_index(dst_vma, dst_addr); - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; - if (unlikely(offset >= max_off)) - goto out_unlock; + goto out_unlock; } ret = -EEXIST; if (!pte_none(ptep_get(dst_pte))) -- cgit v1.2.3 From fc71884a5f599a603fcc3c2b28b3872c09d19c18 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:36 -0700 Subject: mm: userfaultfd: add new UFFDIO_POISON ioctl The basic idea here is to "simulate" memory poisoning for VMs. A VM running on some host might encounter a memory error, after which some page(s) are poisoned (i.e., future accesses SIGBUS). They expect that once poisoned, pages can never become "un-poisoned". So, when we live migrate the VM, we need to preserve the poisoned status of these pages. When live migrating, we try to get the guest running on its new host as quickly as possible. So, we start it running before all memory has been copied, and before we're certain which pages should be poisoned or not. So the basic way to use this new feature is: - On the new host, the guest's memory is registered with userfaultfd, in either MISSING or MINOR mode (doesn't really matter for this purpose). - On any first access, we get a userfaultfd event. At this point we can communicate with the old host to find out if the page was poisoned. - If so, we can respond with a UFFDIO_POISON - this places a swap marker so any future accesses will SIGBUS. Because the pte is now "present", future accesses won't generate more userfaultfd events, they'll just SIGBUS directly. UFFDIO_POISON does not handle unmapping previously-present PTEs. This isn't needed, because during live migration we want to intercept all accesses with userfaultfd (not just writes, so WP mode isn't useful for this). So whether minor or missing mode is being used (or both), the PTE won't be present in any case, so handling that case isn't needed. Similarly, UFFDIO_POISON won't replace existing PTE markers. This might be okay to do, but it seems to be safer to just refuse to overwrite any existing entry (like a UFFD_WP PTE marker). 
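To make the intended flow concrete, here is a rough userspace sketch of the fault-handling side during post-copy. It assumes a kernel with this series applied and a uffd already created, API-negotiated and registered over the guest region in MISSING mode; was_poisoned_on_source() and fetch_page_from_source() are made-up stand-ins for the migration protocol, so treat this as an outline rather than a reference implementation:

#include <linux/userfaultfd.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <unistd.h>

extern int was_poisoned_on_source(uint64_t addr);		/* hypothetical */
extern void fetch_page_from_source(uint64_t addr, void *buf);	/* hypothetical */

static int handle_one_fault(int uffd, size_t page_size, void *buf)
{
	struct uffd_msg msg;
	uint64_t addr;

	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return -1;
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return 0;

	addr = msg.arg.pagefault.address & ~((uint64_t)page_size - 1);

	if (was_poisoned_on_source(addr)) {
		struct uffdio_poison poison = {
			.range	= { .start = addr, .len = page_size },
		};

		/* Install the poison marker: later accesses SIGBUS directly. */
		return ioctl(uffd, UFFDIO_POISON, &poison);
	}

	/* Otherwise resolve the fault as usual, e.g. with UFFDIO_COPY. */
	fetch_page_from_source(addr, buf);

	struct uffdio_copy copy = {
		.dst	= addr,
		.src	= (uint64_t)(uintptr_t)buf,
		.len	= page_size,
	};

	return ioctl(uffd, UFFDIO_COPY, &copy);
}

Because UFFDIO_POISON leaves a present marker PTE behind, subsequent accesses to that page SIGBUS directly without generating further userfaultfd events, which is the property described above.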
Link: https://lkml.kernel.org/r/20230707215540.2324998-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 58 ++++++++++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 4 +++ include/uapi/linux/userfaultfd.h | 16 +++++++++++ mm/userfaultfd.c | 48 ++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c2ed7dcf494e..9854d44ae18e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1967,6 +1967,61 @@ out: return ret; } +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len, + &ctx->mmap_changing, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 
0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2068,6 +2123,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; } return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac7b0c96d351..ac8c6854097c 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -46,6 +46,7 @@ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, + MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 66dd4cd277bd..b5f07eacc697 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -71,6 +71,7 @@ #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -91,6 +92,8 @@ struct uffdio_writeprotect) #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) /* read() structure */ struct uffd_msg { @@ -225,6 +228,7 @@ struct uffdio_api { #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) __u64 features; __u64 ioctls; @@ -321,6 +325,18 @@ struct uffdio_continue { __s64 mapped; }; +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + /* * Flags for the userfaultfd(2) system call itself. */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4244ca7ee903..68157359dc34 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -288,6 +288,40 @@ out_release: goto out; } +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ +static int mfill_atomic_pte_poison(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) +{ + int ret; + struct mm_struct *dst_mm = dst_vma->vm_mm; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + + _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + if (mfill_file_over_size(dst_vma, dst_addr)) { + ret = -EFAULT; + goto out_unlock; + } + + ret = -EEXIST; + /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). 
*/ + if (!pte_none(*dst_pte)) + goto out_unlock; + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; @@ -339,7 +373,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) || + uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -483,6 +518,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { return mfill_atomic_pte_continue(dst_pmd, dst_vma, dst_addr, flags); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + return mfill_atomic_pte_poison(dst_pmd, dst_vma, + dst_addr, flags); } /* @@ -704,6 +742,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } +ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) +{ + return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); +} + long uffd_wp_range(struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { -- cgit v1.2.3 From 597425df4fecd272ca48f73feca7833433c16e12 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 18:27:17 -0700 Subject: mm: userfaultfd: add new UFFDIO_POISON ioctl: fix Smatch has observed that pte_offset_map_lock() is now allowed to fail, and then ptl should not be unlocked. Use -EAGAIN here like elsewhere. Link: https://lkml.kernel.org/r/bc7bba61-d34f-ad3a-ccf1-c191585ef851@google.com Signed-off-by: Hugh Dickins Reviewed-by: Axel Rasmussen Cc: Dan Carpenter Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 68157359dc34..dd167184575e 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -300,7 +300,10 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, spinlock_t *ptl; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; @@ -319,6 +322,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); +out: return ret; } -- cgit v1.2.3 From 8a13897fb0daa8f56821f263f0c63661e1c6acae Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:37 -0700 Subject: mm: userfaultfd: support UFFDIO_POISON for hugetlbfs The behavior here is the same as it is for anon/shmem. This is done separately because hugetlb pte marker handling is a bit different. Link: https://lkml.kernel.org/r/20230707215540.2324998-6-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. 
Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/hugetlb.c | 19 +++++++++++++++++++ mm/userfaultfd.c | 3 +-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ffee2978dfed..7b076eb07a29 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6262,6 +6262,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, int writable; bool folio_in_pagecache = false; + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + ptl = huge_pte_lock(h, dst_mm, dst_pte); + + /* Don't overwrite any existing PTEs (even markers) */ + if (!huge_pte_none(huge_ptep_get(dst_pte))) { + spin_unlock(ptl); + return -EEXIST; + } + + _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + return 0; + } + if (is_continue) { ret = -EFAULT; folio = filemap_lock_folio(mapping, idx); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dd167184575e..0fc69efa4f1f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -377,8 +377,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) || - uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { mmap_read_unlock(dst_mm); return -EINVAL; } -- cgit v1.2.3 From f9044f170c5e89a12116066d1b9b932e934b9efb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sun, 9 Jul 2023 11:56:26 +0900 Subject: zsmalloc: remove obj_tagged() obj_tagged() is not needed at this point, because objects can only have one tag: OBJ_ALLOCATED_TAG. We needed obj_tagged() for the zsmalloc LRU implementation, which has now been removed. Simplify zsmalloc code and revert to the previous implementation that was in place before the zsmalloc LRU series. 
Link: https://lkml.kernel.org/r/20230709025817.3842416-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Nhat Pham Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 84beadc088b8..32f5bc4074df 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -795,8 +795,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, - int tag) +static inline bool obj_allocated(struct page *page, void *obj, + unsigned long *phandle) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -807,7 +807,7 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, } else handle = *(unsigned long *)obj; - if (!(handle & tag)) + if (!(handle & OBJ_ALLOCATED_TAG)) return false; /* Clear all tags before returning the handle */ @@ -815,11 +815,6 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, return true; } -static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) -{ - return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); -} - static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -1551,11 +1546,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find object with a certain tag in zspage from index object and + * Find alloced object in zspage from index object and * return handle. */ -static unsigned long find_tagged_obj(struct size_class *class, - struct page *page, int *obj_idx, int tag) +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) { unsigned int offset; int index = *obj_idx; @@ -1566,7 +1561,7 @@ static unsigned long find_tagged_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_tagged(page, addr + offset, &handle, tag)) + if (obj_allocated(page, addr + offset, &handle)) break; offset += class->size; @@ -1580,16 +1575,6 @@ static unsigned long find_tagged_obj(struct size_class *class, return handle; } -/* - * Find alloced object in zspage from index object and - * return handle. - */ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) -{ - return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); -} - static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, struct zspage *dst_zspage) { -- cgit v1.2.3 From b894da0468640f610d47624e872dc11f2ae5bb4b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 10 Jul 2023 09:37:50 +0000 Subject: mm/mm_init.c: mark check_for_memory() as __init The only caller of check_for_memory() is free_area_init(), which is annotated with __init, so it should be safe to also mark the former as __init. Link: https://lkml.kernel.org/r/20230710093750.1294-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index f90db54e2b21..2daae1dd5755 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1736,7 +1736,7 @@ static void __init free_area_init_node(int nid) } /* Any regular or high memory on that node ? 
*/ -static void check_for_memory(pg_data_t *pgdat) +static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; -- cgit v1.2.3 From de7cb03db05a4b460edefff266bbaead70a11634 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Jul 2023 19:40:50 +0200 Subject: mm/memory_hotplug: document the signal_pending() check in offline_pages() Let's update the documentation that any signal is sufficient, and add a comment that not only checking for fatal signals is historical baggage: changing it now could break existing user space. although unlikely. For example, when an app provides a custom SIGALRM handler and triggers memory offlining, the timeout cmd would no longer stop memory offlining, because SIGALRM would no longer be considered a fatal signal. Note that using signal_pending() instead of fatal_signal_pending() is an anti-pattern, but slowly deprecating that behavior to eventually change it in the far future is probably not worth the effort. If this ever becomes relevant for user-space, we might want to rethink. Link: https://lkml.kernel.org/r/20230711174050.603820-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 2 +- mm/memory_hotplug.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 1b02fe5807cc..bd77841041af 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -669,7 +669,7 @@ when still encountering permanently unmovable pages within ZONE_MOVABLE (-> BUG), memory offlining will keep retrying until it eventually succeeds. When offlining is triggered from user space, the offlining context can be -terminated by sending a fatal signal. A timeout based offlining can easily be +terminated by sending a signal. A timeout based offlining can easily be implemented via:: % timeout $TIMEOUT offline_block | failure_handling diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3f231cf1b410..7cfd13c91568 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1843,6 +1843,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, do { pfn = start_pfn; do { + /* + * Historically we always checked for any signal and + * can't limit it to fatal signals without eventually + * breaking user space. + */ if (signal_pending(current)) { ret = -EINTR; reason = "signal backoff"; -- cgit v1.2.3 From dbe70dbb41ab45a9ea2fa537c9e6c9817477dfff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:09 +0800 Subject: mm: memory-failure: remove unneeded PageHuge() check Patch series "A few fixup and cleanup patches for memory-failure", v2. This series contains a few fixup patches to fix inaccurate mf_stats, fix race window when trying to get hugetlb folio and so on. Also there is minor cleanup for comments and codestyle. More details can be found in the respective changelogs. This patch (of 8): PageHuge() check in me_huge_page() is just for potential problems. Remove it as it's actually dead code and won't catch anything. 
Link: https://lkml.kernel.org/r/20230711055016.2286677-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20230711055016.2286677-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 76da955bf10f..9ed2bd258269 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1187,9 +1187,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) struct address_space *mapping; bool extra_pins = false; - if (!PageHuge(hpage)) - return MF_DELAYED; - mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, page_to_pfn(p), mapping); -- cgit v1.2.3 From 92a025a790f82c278cc39b0997e9b3b6f3b69ee0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:10 +0800 Subject: mm: memory-failure: ensure moving HWPoison flag to the raw error pages If hugetlb_vmemmap_optimized is enabled, folio_clear_hugetlb_hwpoison() called from try_memory_failure_hugetlb() won't transfer HWPoison flag to subpages while folio's HWPoison flag is cleared. So when trying to free this hugetlb page into buddy, folio_clear_hugetlb_hwpoison() is not called to move HWPoison flag from head page to the raw error pages even if now hugetlb_vmemmap_optimized is cleared. This will results in HWPoisoned page being used again and raw_hwp_page leak. Link: https://lkml.kernel.org/r/20230711055016.2286677-3-linmiaohe@huawei.com Fixes: ac5fcde0a96a ("mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9ed2bd258269..71b4bb691c47 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1908,6 +1908,8 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) { if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; + if (folio_test_hugetlb_vmemmap_optimized(folio)) + return; folio_clear_hwpoison(folio); folio_free_raw_hwp(folio, true); } -- cgit v1.2.3 From 80ee7cb271b52e5861eda3c67731c95fd55a2627 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:11 +0800 Subject: mm: memory-failure: don't account hwpoison_filter() filtered pages mf_generic_kill_procs() will return -EOPNOTSUPP when hwpoison_filter() filtered dax page. In that case, action_result() isn't expected to be called to update mf_stats. This will results in inaccurate but benign memory failure handling statistics. Link: https://lkml.kernel.org/r/20230711055016.2286677-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 71b4bb691c47..cac5413c5cc3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2098,7 +2098,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); - action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED); + if (rc != -EOPNOTSUPP) + action_result(pfn, MF_MSG_DAX, rc ? 
MF_FAILED : MF_RECOVERED); return rc; } -- cgit v1.2.3 From 55c7ac4527086d52dedc5da4ee3d676bcc9a7691 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:12 +0800 Subject: mm: memory-failure: use local variable huge to check hugetlb page Use local variable huge to check whether page is hugetlb page to avoid calling PageHuge() multiple times to save cpu cycles. PageHuge() will be stable while extra page refcnt is held. Link: https://lkml.kernel.org/r/20230711055016.2286677-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cac5413c5cc3..3b734d51e6de 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2617,7 +2617,7 @@ static int soft_offline_in_use_page(struct page *page) } lock_page(page); - if (!PageHuge(page)) + if (!huge) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); @@ -2626,7 +2626,7 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page)) + if (!huge && PageLRU(page) && !PageSwapCache(page)) /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. -- cgit v1.2.3 From e9c36f7aca7efee8318b12930b846464b9b5c7a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:13 +0800 Subject: mm: memory-failure: remove unneeded header files Remove some unneeded header files. No functional change intended. Link: https://lkml.kernel.org/r/20230711055016.2286677-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3b734d51e6de..44a0ce3849f3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -50,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -59,7 +57,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 5885c6a62533cbda19e9eceab619bde317de0c0d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:14 +0800 Subject: mm: memory-failure: minor cleanup for comments and codestyle Fix some wrong function names and grammar error in comments. Also remove unneeded space after for_each_process. No functional change intended. Link: https://lkml.kernel.org/r/20230711055016.2286677-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44a0ce3849f3..36529f3c6554 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -608,7 +608,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); - for_each_process (tsk) { + for_each_process(tsk) { struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); @@ -652,7 +652,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, /* * Send early kill signal to tasks where a vma covers * the page but the corrupted page is not necessarily - * mapped it in its pte. 
+ * mapped in its pte. * Assume applications who requested early kill want * to be informed of all such data corruptions. */ @@ -2117,7 +2117,7 @@ static DEFINE_MUTEX(mf_mutex); * detected by a background scrubber) * * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. + * enabled and no spinlocks held. * * Return: 0 for successfully handled the memory error, * -EOPNOTSUPP for hwpoison_filter() filtered the error event, @@ -2221,7 +2221,7 @@ try_again: * otherwise it may race with THP split. * And the flag can't be set in get_hwpoison_page() since * it is called by soft offline too and it is just called - * for !MF_COUNT_INCREASE. So here seems to be the best + * for !MF_COUNT_INCREASED. So here seems to be the best * place. * * Don't need care about the above error handling paths for @@ -2578,10 +2578,10 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) /* * If we succeed to isolate the page, we grabbed another refcount on - * the page, so we can safely drop the one we got from get_any_pages(). + * the page, so we can safely drop the one we got from get_any_page(). * If we failed to isolate the page, it means that we cannot go further * and we will return an error, so drop the reference we got from - * get_any_pages() as well. + * get_any_page() as well. */ put_page(page); return isolated; -- cgit v1.2.3 From a363d1224b5add67a7cafab9fdb9f19d569fbe98 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:15 +0800 Subject: mm: memory-failure: fetch compound head after extra page refcnt is held Page might become thp, huge page or being splited after compound head is fetched but before page refcnt is bumped. So hpage might be a tail page leading to VM_BUG_ON_PAGE(PageTail(page)) in PageTransHuge(). Link: https://lkml.kernel.org/r/20230711055016.2286677-8-linmiaohe@huawei.com Fixes: 415c64c1453a ("mm/memory-failure: split thp earlier in memory error handling") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 36529f3c6554..133737580a7e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2175,8 +2175,6 @@ try_again: goto unlock_mutex; } - hpage = compound_head(p); - /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: @@ -2215,6 +2213,7 @@ try_again: } } + hpage = compound_head(p); if (PageTransHuge(hpage)) { /* * The flag must be set after the refcount is bumped -- cgit v1.2.3 From d31155b8f29ce380f7816e54dee161db6d752909 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:16 +0800 Subject: mm: memory-failure: fix race window when trying to get hugetlb folio page_folio() is fetched before calling get_hwpoison_hugetlb_folio() without hugetlb_lock being held. So hugetlb page could be demoted before get_hwpoison_hugetlb_folio() holding hugetlb_lock but after page_folio() is fetched. So get_hwpoison_hugetlb_folio() will hold unexpected extra refcnt of hugetlb folio while leaving demoted page un-refcnted. 
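The fix follows a familiar "take a speculative reference, then re-validate" pattern: after get_hwpoison_hugetlb_folio() returns, check that the page still belongs to the folio the reference was taken on; if a demotion slipped in, drop that reference and fall back to the now-current folio. Below is a minimal, single-threaded control-flow model of that pattern; struct obj, current_owner(), get_ref() and put_ref() are invented scaffolding rather than mm/memory-failure.c helpers, and the owner change inside get_ref() merely simulates a concurrent demotion.

#include <stdio.h>

struct obj { int refcount; };

static struct obj big, small;
static struct obj *owner = &big;	/* which object currently "contains" the page */

static struct obj *current_owner(void) { return owner; }

static int get_ref(struct obj *o)
{
	o->refcount++;
	owner = &small;			/* simulate a demotion racing with us */
	return 1;
}

static void put_ref(struct obj *o) { o->refcount--; }

/* Speculate on the owner, then re-validate once the reference is held. */
static int grab_owner_ref(void)
{
	struct obj *o = current_owner();	/* possibly stale snapshot */
	int got = get_ref(o);

	if (o != current_owner()) {		/* changed under us? */
		if (got)
			put_ref(o);		/* undo; caller re-resolves the owner */
		return 0;
	}
	return got;
}

int main(void)
{
	printf("%s\n", grab_owner_ref() ? "got ref" : "stale, re-resolve");
	return 0;
}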
Link: https://lkml.kernel.org/r/20230711055016.2286677-9-linmiaohe@huawei.com Fixes: 25182f05ffed ("mm,hwpoison: fix race with hugetlb page allocation") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 133737580a7e..70f44180ef80 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1383,8 +1383,15 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) bool hugetlb = false; ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); - if (hugetlb) - return ret; + if (hugetlb) { + /* Make sure hugetlb demotion did not happen from under us. */ + if (folio == page_folio(page)) + return ret; + if (ret > 0) { + folio_put(folio); + folio = page_folio(page); + } + } /* * This check prevents from calling folio_try_get() for any @@ -1473,8 +1480,13 @@ static int __get_unpoison_page(struct page *page) bool hugetlb = false; ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); - if (hugetlb) - return ret; + if (hugetlb) { + /* Make sure hugetlb demotion did not happen from under us. */ + if (folio == page_folio(page)) + return ret; + if (ret > 0) + folio_put(folio); + } /* * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, -- cgit v1.2.3 From 86aa6998ad00af823de81d12d41d7063c14298a0 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 10 Jul 2023 22:35:44 -0700 Subject: mm/memory: pass folio into do_page_mkwrite() Saves one implicit call to compound_head(). I'm not sure if I should change the name of the function to do_folio_mkwrite() and update the description comment to reference a folio as the vm_op is still called page_mkwrite. Link: https://lkml.kernel.org/r/20230711053544.156617-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 36b164ee9ffb..44d11812a88f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2933,10 +2933,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. 
*/ -static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; - struct folio *folio = page_folio(vmf->page); unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; @@ -3298,7 +3297,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - tmp = do_page_mkwrite(vmf); + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); @@ -4621,7 +4620,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); - tmp = do_page_mkwrite(vmf); + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); -- cgit v1.2.3 From a349d72fd9efc87c8fd1d16d3164752d84a7275b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:30:40 -0700 Subject: mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: free retracted page table by RCU", v3. Some mmap_lock avoidance i.e. latency reduction. Initially just for the case of collapsing shmem or file pages to THPs: the usefulness of MADV_COLLAPSE on shmem is being limited by that mmap_write_lock it currently requires. Likely to be relied upon later in other contexts e.g. freeing of empty page tables (but that's not work I'm doing). mmap_write_lock avoidance when collapsing to anon THPs? Perhaps, but again that's not work I've done: a quick attempt was not as easy as the shmem/file case. These changes (though of course not these exact patches) have been in Google's data centre kernel for three years now: we do rely upon them. This patch (of 13): Before putting them to use (several commits later), add rcu_read_lock() to pte_offset_map(), and rcu_read_unlock() to pte_unmap(). Make this a separate commit, since it risks exposing imbalances: prior commits have fixed all the known imbalances, but we may find some have been missed. Link: https://lkml.kernel.org/r/7cd843a9-aa80-14f-5eb2-33427363c20@google.com Link: https://lkml.kernel.org/r/d3b01da5-2a6-833c-6681-67a3e024a16f@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- mm/pgtable-generic.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5063b482e34f..5134edcec668 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -99,7 +99,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ - /* rcu_read_unlock() to be added later */ \ + rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) @@ -108,7 +108,7 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) } static inline void pte_unmap(pte_t *pte) { - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); } #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4d454953046f..400e5a045848 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -236,7 +236,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pmd_t pmdval; - /* rcu_read_lock() to be added later */ + rcu_read_lock(); pmdval = pmdp_get_lockless(pmd); if (pmdvalp) *pmdvalp = pmdval; @@ -250,7 +250,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) } return __pte_map(&pmdval, addr); nomap: - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); return NULL; } -- cgit v1.2.3 From 146b42e07494e45f7c7bcf2cbf7afd1424afd78e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:32:05 -0700 Subject: mm/pgtable: add PAE safety to __pte_offset_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a faint risk that __pte_offset_map(), on a 32-bit architecture with a 64-bit pmd_t e.g. x86-32 with CONFIG_X86_PAE=y, would succeed on a pmdval assembled from a pmd_low and a pmd_high which never belonged together: their combination not pointing to a page table at all, perhaps not even a valid pfn. pmdp_get_lockless() is not enough to prevent that. Guard against that (on such configs) by local_irq_save() blocking TLB flush between present updates, as linux/pgtable.h suggests. It's only needed around the pmdp_get_lockless() in __pte_offset_map(): a race when __pte_offset_map_lock() repeats the pmdp_get_lockless() after getting the lock, would just send it back to __pte_offset_map() again. Complement this pmdp_get_lockless_start() and pmdp_get_lockless_end(), used only locally in __pte_offset_map(), with a pmdp_get_lockless_sync() synonym for tlb_remove_table_sync_one(): to send the necessary interrupt at the right moment on those configs which do not already send it. CONFIG_GUP_GET_PXX_LOW_HIGH is enabled when required by mips, sh and x86. It is not enabled by arm-32 CONFIG_ARM_LPAE: my understanding is that Will Deacon's 2020 enhancements to READ_ONCE() are sufficient for arm. 
It is not enabled by arc, but its pmd_t is 32-bit even when pte_t 64-bit. Limit the IRQ disablement to CONFIG_HIGHPTE? Perhaps, but would need a little more work, to retry if pmd_low good for page table, but pmd_high non-zero from THP (and that might be making x86-specific assumptions). Link: https://lkml.kernel.org/r/3adcd8f-9191-2df1-d7ea-c4877698aad@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ mm/pgtable-generic.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'mm') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5134edcec668..7f2db400f653 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -390,6 +390,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) return pmd; } #define pmdp_get_lockless pmdp_get_lockless +#define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ @@ -408,6 +409,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } +static inline void pmdp_get_lockless_sync(void) +{ +} #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 400e5a045848..b9a0c2137cc1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -232,12 +232,41 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ + (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) +/* + * See the comment above ptep_get_lockless() in include/linux/pgtable.h: + * the barriers in pmdp_get_lockless() cannot guarantee that the value in + * pmd_high actually belongs with the value in pmd_low; but holding interrupts + * off blocks the TLB flush between present updates, which guarantees that a + * successful __pte_offset_map() points to a page from matched halves. 
+ */ +static unsigned long pmdp_get_lockless_start(void) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + return irqflags; +} +static void pmdp_get_lockless_end(unsigned long irqflags) +{ + local_irq_restore(irqflags); +} +#else +static unsigned long pmdp_get_lockless_start(void) { return 0; } +static void pmdp_get_lockless_end(unsigned long irqflags) { } +#endif + pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { + unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); + irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); + pmdp_get_lockless_end(irqflags); + if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) -- cgit v1.2.3 From 13cf577e6b66a148d6d63f5ef7801f4b61d5850f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:39:48 -0700 Subject: mm/pgtable: add pte_free_defer() for pgtable as page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the generic pte_free_defer(), to call pte_free() via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This version suits all those architectures which use an unfragmented page for one page table (none of whose pte_free()s use the mm arg which was passed to it). Link: https://lkml.kernel.org/r/78e921b0-b681-a1b0-dc20-44c9efa4ef3c@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++++ include/linux/pgtable.h | 2 ++ mm/pgtable-generic.c | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 51d04c1847c1..1fc4b9c2c8a6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,10 @@ struct page { struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ + /* + * A PTE page table page might be freed by use of + * rcu_head: which overlays those two fields above. + */ unsigned long _pt_pad_2; /* mapping */ union { struct mm_struct *pt_mm; /* x86 pgds only */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7f2db400f653..9fa34be65159 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -112,6 +112,8 @@ static inline void pte_unmap(pte_t *pte) } #endif +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* Find an entry in the second-level page table.. 
*/ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index b9a0c2137cc1..fa9d4d084291 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -230,6 +231,25 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, return pmd; } #endif + +/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ +#ifndef pte_free_defer +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = pgtable; + call_rcu(&page->rcu_head, pte_free_now); +} +#endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ -- cgit v1.2.3 From 1d65b771bc08cd054cf6d3766a72e113dc46d62f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:41:04 -0700 Subject: mm/khugepaged: retract_page_tables() without mmap or vma lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify shmem and file THP collapse's retract_page_tables(), and relax its locking: to improve its success rate and to lessen impact on others. Instead of its MADV_COLLAPSE case doing set_huge_pmd() at target_addr of target_mm, leave that part of the work to madvise_collapse() calling collapse_pte_mapped_thp() afterwards: just adjust collapse_file()'s result code to arrange for that. That spares retract_page_tables() four arguments; and since it will be successful in retracting all of the page tables expected of it, no need to track and return a result code itself. It needs i_mmap_lock_read(mapping) for traversing the vma interval tree, but it does not need i_mmap_lock_write() for that: page_vma_mapped_walk() allows for pte_offset_map_lock() etc to fail, and uses pmd_lock() for THPs. retract_page_tables() just needs to use those same spinlocks to exclude it briefly, while transitioning pmd from page table to none: so restore its use of pmd_lock() inside of which pte lock is nested. Users of pte_offset_map_lock() etc all now allow for them to fail: so retract_page_tables() now has no use for mmap_write_trylock() or vma_try_start_write(). In common with rmap and page_vma_mapped_walk(), it does not even need the mmap_read_lock(). But those users do expect the page table to remain a good page table, until they unlock and rcu_read_unlock(): so the page table cannot be freed immediately, but rather by the recently added pte_free_defer(). Use the (usually a no-op) pmdp_get_lockless_sync() to send an interrupt when PAE, and pmdp_collapse_flush() did not already do so: to make sure that the start,pmdp_get_lockless(),end sequence in __pte_offset_map() cannot pick up a pmd entry with mismatched pmd_low and pmd_high. retract_page_tables() can be enhanced to replace_page_tables(), which inserts the final huge pmd without mmap lock: going through an invalid state instead of pmd_none() followed by fault. But that enhancement does raise some more questions: leave it until a later release. 
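Condensed from the diff that follows, the per-VMA work in the reworked retract_page_tables() now looks roughly like the excerpt below (skip checks and declarations trimmed). It is only a reading aid for the flattened diff, not compilable on its own.

/* Reading aid: core of the new retract_page_tables(); the diff below
 * is authoritative. */
i_mmap_lock_read(mapping);			/* read, not write, now suffices */
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
	/* ... skip anon_vma, misaligned, exiting-mm and uffd-wp vmas ... */

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				addr, addr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);

	pml = pmd_lock(mm, pmd);		/* no mmap or vma lock taken */
	ptl = pte_lockptr(mm, pmd);
	if (ptl != pml)
		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

	/* re-check what a racing userfaultfd_ioctl() may have changed */
	if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) {
		skipped_uffd = true;
	} else {
		pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
		pmdp_get_lockless_sync();	/* IRQ sync for PAE-style configs */
	}

	if (ptl != pml)
		spin_unlock(ptl);
	spin_unlock(pml);
	mmu_notifier_invalidate_range_end(&range);

	if (!skipped_uffd) {
		mm_dec_nr_ptes(mm);
		page_table_check_pte_clear_range(mm, addr, pgt_pmd);
		pte_free_defer(mm, pmd_pgtable(pgt_pmd));	/* freed via RCU */
	}
}
i_mmap_unlock_read(mapping);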
Link: https://lkml.kernel.org/r/f88970d9-d347-9762-ae6d-da978e8a4df@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 172 +++++++++++++++++++++++--------------------------------- 1 file changed, 69 insertions(+), 103 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e707da4a83c..8f88fd6d781d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1617,9 +1617,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, break; case SCAN_PMD_NONE: /* - * In MADV_COLLAPSE path, possible race with khugepaged where - * all pte entries have been removed and pmd cleared. If so, - * skip all the pte checks and just update the pmd mapping. + * All pte entries have been removed and pmd cleared. + * Skip all the pte checks and just update the pmd mapping. */ goto maybe_install_pmd; default: @@ -1750,123 +1749,88 @@ out: mmap_write_unlock(mm); } -static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, - struct mm_struct *target_mm, - unsigned long target_addr, struct page *hpage, - struct collapse_control *cc) +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; - int target_result = SCAN_FAIL; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - int result = SCAN_FAIL; - struct mm_struct *mm = NULL; - unsigned long addr = 0; - pmd_t *pmd; - bool is_target = false; + struct mmu_notifier_range range; + struct mm_struct *mm; + unsigned long addr; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml; + spinlock_t *ptl; + bool skipped_uffd = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth investing - * mmap_write_lock(mm) as PMD-mapping is likely to be split - * later. - * - * Note that vma->anon_vma check is racy: it can be set up after - * the check but before we took mmap_lock by the fault path. - * But page lock would prevent establishing any new ptes of the - * page, so we are safe. - * - * An alternative would be drop the check, but check that page - * table is clear before calling pmdp_collapse_flush() under - * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. It would also probably require locking - * the anon_vma. + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. 
*/ - if (READ_ONCE(vma->anon_vma)) { - result = SCAN_PAGE_ANON; - goto next; - } + if (READ_ONCE(vma->anon_vma)) + continue; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || - vma->vm_end < addr + HPAGE_PMD_SIZE) { - result = SCAN_VMA_CHECK; - goto next; - } + vma->vm_end < addr + HPAGE_PMD_SIZE) + continue; + mm = vma->vm_mm; - is_target = mm == target_mm && addr == target_addr; - result = find_pmd_or_thp_or_none(mm, addr, &pmd); - if (result != SCAN_SUCCEED) - goto next; + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) + continue; + + if (hpage_collapse_test_exit(mm)) + continue; /* - * We need exclusive mmap_lock to retract page table. - * - * We use trylock due to lock inversion: we need to acquire - * mmap_lock while holding page lock. Fault path does it in - * reverse order. Trylock is a way to avoid deadlock. - * - * Also, it's not MADV_COLLAPSE's job to collapse other - * mappings - let khugepaged take care of them later. + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. */ - result = SCAN_PTE_MAPPED_HUGEPAGE; - if ((cc->is_khugepaged || is_target) && - mmap_write_trylock(mm)) { - /* trylock for the same lock inversion as above */ - if (!vma_try_start_write(vma)) - goto unlock_next; + if (userfaultfd_wp(vma)) + continue; - /* - * Re-check whether we have an ->anon_vma, because - * collapse_and_free_pmd() requires that either no - * ->anon_vma exists or the anon_vma is locked. - * We already checked ->anon_vma above, but that check - * is racy because ->anon_vma can be populated under the - * mmap lock in read mode. - */ - if (vma->anon_vma) { - result = SCAN_PAGE_ANON; - goto unlock_next; - } - /* - * When a vma is registered with uffd-wp, we can't - * recycle the pmd pgtable because there can be pte - * markers installed. Skip it only, so the rest mm/vma - * can still have the same file mapped hugely, however - * it'll always mapped in small page size for uffd-wp - * registered ranges. - */ - if (hpage_collapse_test_exit(mm)) { - result = SCAN_ANY_PROCESS; - goto unlock_next; - } - if (userfaultfd_wp(vma)) { - result = SCAN_PTE_UFFD_WP; - goto unlock_next; - } - collapse_and_free_pmd(mm, vma, addr, pmd); - if (!cc->is_khugepaged && is_target) - result = set_huge_pmd(vma, addr, pmd, hpage); - else - result = SCAN_SUCCEED; + /* PTEs were notified when unmapped; but now for the PMD? */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + addr, addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + + pml = pmd_lock(mm, pmd); + ptl = pte_lockptr(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); -unlock_next: - mmap_write_unlock(mm); - goto next; - } /* - * Calling context will handle target mm/addr. Otherwise, let - * khugepaged try again later. + * Huge page lock is still held, so normally the page table + * must remain empty; and we have already skipped anon_vma + * and userfaultfd_wp() vmas. But since the mmap_lock is not + * held, it is still possible for a racing userfaultfd_ioctl() + * to have inserted ptes or markers. Now that we hold ptlock, + * repeating the anon_vma check protects from one category, + * and repeating the userfaultfd_wp() check from another. 
*/ - if (!is_target) { - khugepaged_add_pte_mapped_thp(mm, addr); - continue; + if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { + skipped_uffd = true; + } else { + pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); + pmdp_get_lockless_sync(); + } + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + mmu_notifier_invalidate_range_end(&range); + + if (!skipped_uffd) { + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } -next: - if (is_target) - target_result = result; } - i_mmap_unlock_write(mapping); - return target_result; + i_mmap_unlock_read(mapping); } /** @@ -2260,9 +2224,11 @@ immap_locked: /* * Remove pte page tables, so we can re-fault the page as huge. + * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ - result = retract_page_tables(mapping, start, mm, addr, hpage, - cc); + retract_page_tables(mapping, start); + if (cc && !cc->is_khugepaged) + result = SCAN_PTE_MAPPED_HUGEPAGE; unlock_page(hpage); /* -- cgit v1.2.3 From 1043173eb5eb351a1dba11cca12705075fe74a9e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:42:19 -0700 Subject: mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp(). It does need mmap_read_lock(), but it does not need mmap_write_lock(), nor vma_start_write() nor i_mmap lock nor anon_vma lock. All racing paths are relying on pte_offset_map_lock() and pmd_lock(), so use those. Follow the pattern in retract_page_tables(); and using pte_free_defer() removes most of the need for tlb_remove_table_sync_one() here; but call pmdp_get_lockless_sync() to use it in the PAE case. First check the VMA, in case page tables are being torn down: from JannH. Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been acquired and the page looks suitable: from then on its state is stable. However, collapse_pte_mapped_thp() was doing something others don't: freeing a page table still containing "valid" entries. i_mmap lock did stop a racing truncate from double-freeing those pages, but we prefer collapse_pte_mapped_thp() to clear the entries as usual. Their TLB flush can wait until the pmdp_collapse_flush() which follows, but the mmu_notifier_invalidate_range_start() has to be done earlier. Do the "step 1" checking loop without mmu_notifier: it wouldn't be good for khugepaged to keep on repeatedly invalidating a range which is then found unsuitable e.g. contains COWs. "step 2", which does the clearing, must then be more careful (after dropping ptl to do mmu_notifier), with abort prepared to correct the accounting like "step 3". But with those entries now cleared, "step 4" (after dropping ptl to do pmd_lock) is kept safe by the huge page lock, which stops new PTEs from being faulted in. 
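For orientation, the reworked collapse_pte_mapped_thp() proceeds roughly as outlined below. This is a trimmed outline distilled from the diff that follows (declarations and most error handling omitted), not compilable code.

/* Outline only: new shape of collapse_pte_mapped_thp(). */
mmap_assert_locked(mm);			/* mmap read lock now suffices */
/* check the VMA, find_pmd_or_thp_or_none(), lock and verify the hpage */

/* step 1: scan the PTEs (no mmu_notifier yet): all must map the hpage */
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
/* ... verify each entry, then pte_unmap_unlock(start_pte, ptl) ... */

/* step 2: re-take the pte lock inside an mmu_notifier range and clear */
mmu_notifier_invalidate_range_start(&range);
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
/* ... ptep_clear() + page_remove_rmap() for each mapped subpage ... */

/* step 3: adjust the hpage refcount and mm_counter_file() accordingly */

/* step 4: remove the now-empty page table under pmd_lock */
pml = pmd_lock(mm, pmd);
/* nested pte lock, pmdp_collapse_flush(), pmdp_get_lockless_sync(),
 * unlock, mmu_notifier_invalidate_range_end() ... */
pte_free_defer(mm, pmd_pgtable(pgt_pmd));	/* RCU-deferred free */

/* step 5: optionally install the huge pmd via set_huge_pmd(); the abort
 * path flushes the TLB and restores counters if the checks bail out */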
[hughd@google.com: don't set mmap_locked = true in madvise_collapse()] Link: https://lkml.kernel.org/r/d3d9ff14-ef8-8f84-e160-bfa1f5794275@google.com [hughd@google.com: use ptep_clear() instead of pte_clear()] Link: https://lkml.kernel.org/r/e0197433-8a47-6a65-534d-eda26eeb78b0@google.com Link: https://lkml.kernel.org/r/b53be6a4-7715-51f9-aad-f1347dcb7c4@google.com Signed-off-by: Hugh Dickins Reviewed-by: Qi Zheng Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 170 +++++++++++++++++++++++++------------------------------- 1 file changed, 76 insertions(+), 94 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8f88fd6d781d..53d1788332cb 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return ret; } -/* hpage must be locked, and mmap_lock must be held in write */ +/* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) { @@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, }; VM_BUG_ON(!PageTransHuge(hpage)); - mmap_assert_write_locked(vma->vm_mm); + mmap_assert_locked(vma->vm_mm); if (do_set_pmd(&vmf, hpage)) return SCAN_FAIL; @@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -/* - * A note about locking: - * Trying to take the page table spinlocks would be useless here because those - * are only used to synchronize: - * - * - modifying terminal entries (ones that point to a data page, not to another - * page table) - * - installing *new* non-terminal entries - * - * Instead, we need roughly the same kind of protection as free_pgtables() or - * mm_take_all_locks() (but only for a single VMA): - * The mmap lock together with this VMA's rmap locks covers all paths towards - * the page table entries we're messing with here, except for hardware page - * table walks and lockless_pages_from_mm(). - */ -static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - struct mmu_notifier_range range; - - mmap_assert_write_locked(mm); - if (vma->vm_file) - lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem); - /* - * All anon_vmas attached to the VMA have the same root and are - * therefore locked by the same lock. 
- */ - if (vma->anon_vma) - lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - pmd = pmdp_collapse_flush(vma, addr, pmdp); - tlb_remove_table_sync_one(); - mmu_notifier_invalidate_range_end(&range); - mm_dec_nr_ptes(mm); - page_table_check_pte_clear_range(mm, addr, pmd); - pte_free(mm, pmd_pgtable(pmd)); -} - /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. @@ -1563,26 +1521,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + struct mmu_notifier_range range; + bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd; - spinlock_t *ptl; - int count = 0, result = SCAN_FAIL; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml, *ptl; + int nr_ptes = 0, result = SCAN_FAIL; int i; - mmap_assert_write_locked(mm); + mmap_assert_locked(mm); + + /* First check VMA found, in case page tables are being torn down */ + if (!vma || !vma->vm_file || + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) + return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; - if (!vma || !vma->vm_file || - !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return SCAN_VMA_CHECK; - /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in @@ -1612,6 +1573,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; @@ -1625,27 +1587,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } - /* Lock the vma before taking i_mmap and page table locks */ - vma_start_write(vma); - - /* - * We need to lock the mapping so that from here on, only GUP-fast and - * hardware page walks can access the parts of the page tables that - * we're operating on. - * See collapse_and_free_pmd(). - */ - i_mmap_lock_write(vma->vm_file->f_mapping); - - /* - * This spinlock should be unnecessary: Nobody else should be accessing - * the page tables under spinlock protection here, only - * lockless_pages_from_mm() and the hardware page walker can access page - * tables while all the high-level locks are held in write mode. 
- */ result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); - if (!start_pte) - goto drop_immap; + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto drop_hpage; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1672,10 +1617,18 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, */ if (hpage + i != page) goto abort; - count++; } - /* step 2: adjust rmap */ + pte_unmap_unlock(start_pte, ptl); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + haddr, haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + notified = true; + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto abort; + + /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; @@ -1683,47 +1636,76 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (pte_none(ptent)) continue; + /* + * We dropped ptl after the first scan, to do the mmu_notifier: + * page lock stops more PTEs of the hpage being faulted in, but + * does not stop write faults COWing anon copies from existing + * PTEs; and does not stop those being swapped out or migrated. + */ + if (!pte_present(ptent)) { + result = SCAN_PTE_NON_PRESENT; + goto abort; + } page = vm_normal_page(vma, addr, ptent); - if (WARN_ON_ONCE(page && is_zone_device_page(page))) + if (hpage + i != page) goto abort; + + /* + * Must clear entry, or a racing truncate may re-remove it. + * TLB flush can be left until pmdp_collapse_flush() does it. + * PTE dirty? Shmem page is already dirty; file is read-only. + */ + ptep_clear(mm, addr, pte); page_remove_rmap(page, vma, false); + nr_ptes++; } pte_unmap_unlock(start_pte, ptl); /* step 3: set proper refcount and mm_counters. */ - if (count) { - page_ref_sub(hpage, count); - add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + if (nr_ptes) { + page_ref_sub(hpage, nr_ptes); + add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); } - /* step 4: remove pte entries */ - /* we make no change to anon, but protect concurrent anon page lookup */ - if (vma->anon_vma) - anon_vma_lock_write(vma->anon_vma); + /* step 4: remove page table */ - collapse_and_free_pmd(mm, vma, haddr, pmd); + /* Huge page lock is still held, so page table must remain empty */ + pml = pmd_lock(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); + pmdp_get_lockless_sync(); + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); - if (vma->anon_vma) - anon_vma_unlock_write(vma->anon_vma); - i_mmap_unlock_write(vma->vm_file->f_mapping); + mmu_notifier_invalidate_range_end(&range); + + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, haddr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? 
set_huge_pmd(vma, haddr, pmd, hpage) : SCAN_SUCCEED; - + goto drop_hpage; +abort: + if (nr_ptes) { + flush_tlb_mm(mm); + page_ref_sub(hpage, nr_ptes); + add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + } + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (notified) + mmu_notifier_invalidate_range_end(&range); drop_hpage: unlock_page(hpage); put_page(hpage); return result; - -abort: - pte_unmap_unlock(start_pte, ptl); -drop_immap: - i_mmap_unlock_write(vma->vm_file->f_mapping); - goto drop_hpage; } static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) @@ -2856,9 +2838,9 @@ handle_result: case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); BUG_ON(*prev); - mmap_write_lock(mm); + mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); - mmap_write_unlock(mm); + mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: -- cgit v1.2.3 From d50791c2bee9ed97b1dd81db9bbb11caddcdfb0d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:43:36 -0700 Subject: mm/khugepaged: delete khugepaged_collapse_pte_mapped_thps() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that retract_page_tables() can retract page tables reliably, without depending on trylocks, delete all the apparatus for khugepaged to try again later: khugepaged_collapse_pte_mapped_thps() etc; and free up the per-mm memory which was set aside for that in the khugepaged_mm_slot. But one part of that is worth keeping: when hpage_collapse_scan_file() found SCAN_PTE_MAPPED_HUGEPAGE, that address was noted in the mm_slot to be tried for retraction later - catching, for example, page tables where a reversible mprotect() of a portion had required splitting the pmd, but now it can be recollapsed. Call collapse_pte_mapped_thp() directly in this case (why was it deferred before? I assume an issue with needing mmap_lock for write, but now it's only needed for read). [hughd@google.com: fix mmap_locked handlng] Link: https://lkml.kernel.org/r/bfc6cab2-497f-32bf-dd5-98dc1987e4a9@google.com Link: https://lkml.kernel.org/r/a5dce57-6dfa-5559-4698-e817eb2f993@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 125 +++++++------------------------------------------------- 1 file changed, 15 insertions(+), 110 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 53d1788332cb..9a6e0d507759 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -93,8 +93,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; -#define MAX_PTE_MAPPED_THP 8 - struct collapse_control { bool is_khugepaged; @@ -108,15 +106,9 @@ struct collapse_control { /** * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned * @slot: hash lookup from mm to mm_slot - * @nr_pte_mapped_thp: number of pte mapped THP - * @pte_mapped_thp: address array corresponding pte mapped THP */ struct khugepaged_mm_slot { struct mm_slot slot; - - /* pte-mapped THP in this mm */ - int nr_pte_mapped_thp; - unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -1441,50 +1433,6 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } #ifdef CONFIG_SHMEM -/* - * Notify khugepaged that given addr of the mm is pte-mapped THP. Then - * khugepaged should try to collapse the page table. - * - * Note that following race exists: - * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, - * emptying the A's ->pte_mapped_thp[] array. - * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and - * retract_page_tables() finds a VMA in mm_struct A mapping the same extent - * (at virtual address X) and adds an entry (for X) into mm_struct A's - * ->pte-mapped_thp[] array. - * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, - * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry - * (for X) into mm_struct A's ->pte-mapped_thp[] array. - * Thus, it's possible the same address is added multiple times for the same - * mm_struct. Should this happen, we'll simply attempt - * collapse_pte_mapped_thp() multiple times for the same address, under the same - * exclusive mmap_lock, and assuming the first call is successful, subsequent - * attempts will return quickly (without grabbing any additional locks) when - * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap - * check, and since this is a rare occurrence, the cost of preventing this - * "multiple-add" is thought to be more expensive than just handling it, should - * it occur. 
- */ -static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) -{ - struct khugepaged_mm_slot *mm_slot; - struct mm_slot *slot; - bool ret = false; - - VM_BUG_ON(addr & ~HPAGE_PMD_MASK); - - spin_lock(&khugepaged_mm_lock); - slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { - mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; - ret = true; - } - spin_unlock(&khugepaged_mm_lock); - return ret; -} - /* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) @@ -1708,29 +1656,6 @@ drop_hpage: return result; } -static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) -{ - struct mm_slot *slot = &mm_slot->slot; - struct mm_struct *mm = slot->mm; - int i; - - if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return; - - if (!mmap_write_trylock(mm)) - return; - - if (unlikely(hpage_collapse_test_exit(mm))) - goto out; - - for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); - -out: - mm_slot->nr_pte_mapped_thp = 0; - mmap_write_unlock(mm); -} - static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -2371,16 +2296,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, { BUILD_BUG(); } - -static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) -{ -} - -static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) -{ - return false; -} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, @@ -2410,7 +2325,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); - khugepaged_collapse_pte_mapped_thps(mm_slot); mm = slot->mm; /* @@ -2463,36 +2377,27 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - *result = hpage_collapse_scan_file(mm, - khugepaged_scan.address, - file, pgoff, cc); mmap_locked = false; + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, file, pgoff, cc); fput(file); + if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { + mmap_read_lock(mm); + if (hpage_collapse_test_exit(mm)) + goto breakouterloop; + *result = collapse_pte_mapped_thp(mm, + khugepaged_scan.address, false); + if (*result == SCAN_PMD_MAPPED) + *result = SCAN_SUCCEED; + mmap_read_unlock(mm); + } } else { *result = hpage_collapse_scan_pmd(mm, vma, - khugepaged_scan.address, - &mmap_locked, - cc); + khugepaged_scan.address, &mmap_locked, cc); } - switch (*result) { - case SCAN_PTE_MAPPED_HUGEPAGE: { - pmd_t *pmd; - - *result = find_pmd_or_thp_or_none(mm, - khugepaged_scan.address, - &pmd); - if (*result != SCAN_SUCCEED) - break; - if (!khugepaged_add_pte_mapped_thp(mm, - khugepaged_scan.address)) - break; - } fallthrough; - case SCAN_SUCCEED: + + if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; - break; - default: - break; - } /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; -- cgit v1.2.3 From 610d06576737f5401647a4aab46558c1114898fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:46:23 -0700 Subject: mm/pgtable: notes on pte_offset_map[_lock]() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a 
block of comments on pte_offset_map_lock(), pte_offset_map() and pte_offset_map_nolock() to mm/pgtable-generic.c, to help explain them. Link: https://lkml.kernel.org/r/b791c3b0-25c6-a263-d785-d564344eb644@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/pgtable-generic.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index fa9d4d084291..4fcd959dcc4d 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -315,6 +315,50 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, return pte; } +/* + * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation + * __pte_offset_map_lock() below, is usually called with the pmd pointer for + * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while + * holding mmap_lock or vma lock for read or for write; or in truncate or rmap + * context, while holding file's i_mmap_lock or anon_vma lock for read (or for + * write). In a few cases, it may be used with pmd pointing to a pmd_t already + * copied to or constructed on the stack. + * + * When successful, it returns the pte pointer for addr, with its page table + * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent + * modification by software, with a pointer to that spinlock in ptlp (in some + * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's + * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. + * + * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no + * page table at *pmd: if, for example, the page table has just been removed, + * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked + * after acquiring the ptlock, and retried internally if it changed: so that a + * page table can be safely removed or replaced by THP while holding its lock.) + * + * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, + * just returns the pte pointer for addr, its page table kmapped if necessary; + * or NULL if there is no page table at *pmd. It does not attempt to lock the + * page table, so cannot normally be used when the page table is to be updated, + * or when entries read must be stable. But it does take rcu_read_lock(): so + * that even when page table is racily removed, it remains a valid though empty + * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s + * afterwards. 
+ * + * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); + * but when successful, it also outputs a pointer to the spinlock in ptlp - as + * pte_offset_map_lock() does, but in this case without locking it. This helps + * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time + * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock + * pointer for the page table that it returns. In principle, the caller should + * recheck *pmd once the lock is taken; in practice, no callsite needs that - + * either the mmap_lock for write, or pte_same() check on contents, is enough. + * + * Note that free_pgtables(), used after unmapping detached vmas, or when + * exiting the whole mm, does not take page table lock before freeing a page + * table, and may not use RCU at all: "outsiders" like khugepaged should avoid + * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. + */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { -- cgit v1.2.3 From 6852c46c783d20a4c0153d14d2990040e5e6e47e Mon Sep 17 00:00:00 2001 From: Yu Ma Date: Wed, 12 Jul 2023 10:57:39 -0400 Subject: mm/mmap: move vma operations to mm_struct out of the critical section of file mapping lock UnixBench/Execl represents a class of workload where bash scripts are spawned frequently to do some short jobs. When running multiple parallel tasks, hot osq_lock is observed from do_mmap and exit_mmap. Both of them come from load_elf_binary through the call chain "execl->do_execveat_common->bprm_execve->load_elf_binary". In do_mmap,it will call mmap_region to create vma node, initialize it and insert it to vma maintain structure in mm_struct and i_mmap tree of the mapping file, then increase map_count to record the number of vma nodes used. The hot osq_lock is to protect operations on file's i_mmap tree. For the mm_struct member change like vma insertion and map_count update, they do not affect i_mmap tree. Move those operations out of the lock's critical section, to reduce hold time on the lock. With this change, on Intel Sapphire Rapids 112C/224T platform, based on v6.0-rc6, the 160 parallel score improves by 12%. The patch has no obvious performance gain on v6.5-rc1 due to regression of this benchmark from this commit f1a7941243c102a44e8847e3b94ff4ff3ec56f25 (mm: convert mm's rss stats into percpu_counter). Related discussion and conclusion can be referred at the mail thread initiated by 0day as below: Link: https://lore.kernel.org/linux-mm/a4aa2e13-7187-600b-c628-7e8fb108def0@intel.com/ Link: https://lkml.kernel.org/r/20230712145739.604215-1-yu.ma@intel.com Signed-off-by: Yu Ma Reviewed-by: Tim Chen Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A . Shutemov Cc: Liam R. 
Howlett Cc: Shakeel Butt Cc: Zhu, Lipeng Signed-off-by: Andrew Morton --- mm/mmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3937479d0e07..e1586b2f938e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -412,14 +412,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) if (vma_iter_prealloc(&vmi)) return -ENOMEM; + vma_iter_store(&vmi, vma); + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); - } - - vma_iter_store(&vmi, vma); - - if (mapping) { __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } @@ -2812,12 +2809,10 @@ cannot_expand: /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { + i_mmap_lock_write(vma->vm_file->f_mapping); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(vma->vm_file->f_mapping); -- cgit v1.2.3 From 9e130c4b000b0a3f0bf4b4c8e714bfe3d06ff4cc Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:30 +0000 Subject: mm/hwpoison: delete all entries before traversal in __folio_free_raw_hwp Patch series "Improve hugetlbfs read on HWPOISON hugepages", v4. Today when hardware memory is corrupted in a hugetlb hugepage, the kernel leaves the hugepage in the pagecache [1]; otherwise future mmap or read would be subject to silent data corruption. This is implemented by returning -EIO from hugetlb_read_iter immediately if the hugepage has the HWPOISON flag set. Since memory_failure already tracks the raw HWPOISON subpages in a hugepage, a natural improvement is possible: if userspace only asks for healthy subpages in the pagecache, the kernel can return that data. This patchset implements this improvement. It consists of three parts. The 1st commit exports the functionality to tell if a subpage inside a hugetlb hugepage is a raw HWPOISON page. The 2nd commit teaches hugetlbfs_read_iter to return as many healthy bytes as possible. The 3rd commit properly tests this new feature. [1] commit 8625147cafaa ("hugetlbfs: don't delete error page from pagecache") This patch (of 4): Traversal on an llist (e.g. llist_for_each_safe) is only safe AFTER entries are deleted from the llist. Correct the way __folio_free_raw_hwp deletes and frees raw_hwp_page entries in raw_hwp_list: first llist_del_all, then kfree within llist_for_each_safe. As of today, concurrent adding, deleting, and traversal on raw_hwp_list from hugetlb.c and/or memory-failure.c are fine with each other. Note this is guaranteed partly by the lock-free nature of llist, and partly by holding hugetlb_lock and/or mf_mutex. For example, as llist_del_all is lock-free with itself, folio_clear_hugetlb_hwpoison()s from __update_and_free_hugetlb_folio and memory_failure won't need explicit locking when freeing the raw_hwp_list. New code that manipulates raw_hwp_list must be careful to ensure concurrency correctness.
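As an aside, the detach-then-traverse rule above can be shown with a minimal, self-contained userspace C sketch. This is not the kernel code: struct node, push() and free_all() are invented for the example, and C11 atomics stand in for the kernel's llist primitives. free_all() mirrors llist_del_all() followed by llist_for_each_safe(): it atomically detaches the whole list first, then walks and frees the now-private snapshot.

    /* Userspace sketch of the detach-then-traverse pattern (hypothetical names). */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
            struct node *next;
            int value;
    };

    static _Atomic(struct node *) list_head;

    static void push(int value)
    {
            struct node *n = malloc(sizeof(*n));

            n->value = value;
            /* lock-free push: retry until the head we observed is still current */
            n->next = atomic_load(&list_head);
            while (!atomic_compare_exchange_weak(&list_head, &n->next, n))
                    ;
    }

    static unsigned long free_all(void)
    {
            /* detach the whole list first (the llist_del_all() step) ... */
            struct node *head = atomic_exchange(&list_head, NULL);
            struct node *n, *tmp;
            unsigned long count = 0;

            /* ... then it is safe to walk and free the private snapshot */
            for (n = head; n; n = tmp) {
                    tmp = n->next;
                    free(n);
                    count++;
            }
            return count;
    }

    int main(void)
    {
            for (int i = 0; i < 5; i++)
                    push(i);
            printf("freed %lu nodes\n", free_all());
            return 0;
    }

Freeing nodes while they are still reachable from the shared head would race with a concurrent push; detaching first is exactly why __folio_free_raw_hwp now calls llist_del_all before kfree-ing its entries.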
Link: https://lkml.kernel.org/r/20230713001833.3778937-1-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20230713001833.3778937-2-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Mike Kravetz Acked-by: Naoya Horiguchi Cc: James Houghton Cc: Miaohe Lin Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 70f44180ef80..9422a5770db6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1830,12 +1830,11 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { - struct llist_head *head; - struct llist_node *t, *tnode; + struct llist_node *t, *tnode, *head; unsigned long count = 0; - head = raw_hwp_list_head(folio); - llist_for_each_safe(tnode, t, head->first) { + head = llist_del_all(raw_hwp_list_head(folio)); + llist_for_each_safe(tnode, t, head) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); if (move_flag) @@ -1845,7 +1844,6 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) kfree(p); count++; } - llist_del_all(head); return count; } -- cgit v1.2.3 From b79f8eb408d0468df0d6082ed958b67d94adce65 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:31 +0000 Subject: mm/hwpoison: check if a raw page in a hugetlb folio is raw HWPOISON Add the functionality, is_raw_hwpoison_page_in_hugepage, to tell if a raw page in a hugetlb folio is HWPOISON. This functionality relies on RawHwpUnreliable not being set; otherwise the hugepage's raw HWPOISON list becomes meaningless. is_raw_hwpoison_page_in_hugepage holds mf_mutex in order to synchronize with folio_set_hugetlb_hwpoison and folio_free_raw_hwp, which iterate, insert, or delete entries in raw_hwp_list. llist itself doesn't ensure insertion and removal are synchronized with the llist_for_each_entry used by is_raw_hwpoison_page_in_hugepage (unless iterated entries are already deleted from the list). Callers can minimize the overhead of lock cycles by first checking the HWPOISON flag of the folio. Export this functionality so it can be immediately used in the read operation for hugetlbfs. Link: https://lkml.kernel.org/r/20230713001833.3778937-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: James Houghton Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++++ mm/memory-failure.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9bc3c2d71b71..9f4bac3df59e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -997,6 +997,11 @@ void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif +/* + * Check if a given raw @page in a hugepage is HWPOISON.
+ */ +bool is_raw_hwpoison_page_in_hugepage(struct page *page); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9422a5770db6..adb0dacbc74e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -72,6 +72,8 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +static DEFINE_MUTEX(mf_mutex); + void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); @@ -1814,6 +1816,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #endif /* CONFIG_FS_DAX */ #ifdef CONFIG_HUGETLB_PAGE + /* * Struct raw_hwp_page represents information about "raw error page", * constructing singly linked list from ->_hugetlb_hwpoison field of folio. @@ -1828,6 +1831,41 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) return (struct llist_head *)&folio->_hugetlb_hwpoison; } +bool is_raw_hwpoison_page_in_hugepage(struct page *page) +{ + struct llist_head *raw_hwp_head; + struct raw_hwp_page *p; + struct folio *folio = page_folio(page); + bool ret = false; + + if (!folio_test_hwpoison(folio)) + return false; + + if (!folio_test_hugetlb(folio)) + return PageHWPoison(page); + + /* + * When RawHwpUnreliable is set, kernel lost track of which subpages + * are HWPOISON. So return as if ALL subpages are HWPOISONed. + */ + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) + return true; + + mutex_lock(&mf_mutex); + + raw_hwp_head = raw_hwp_list_head(folio); + llist_for_each_entry(p, raw_hwp_head->first, node) { + if (page == p->page) { + ret = true; + break; + } + } + + mutex_unlock(&mf_mutex); + + return ret; +} + static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { struct llist_node *t, *tnode, *head; @@ -2110,8 +2148,6 @@ out: return rc; } -static DEFINE_MUTEX(mf_mutex); - /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page -- cgit v1.2.3 From 5ba72b4d063520b1bbe00f78dcdb726d486f96d6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 20:05:57 +0800 Subject: mm/huge_memory: use RMAP_NONE when calling page_add_anon_rmap() It's more convenient and readable to use RMAP_NONE instead of false when calling page_add_anon_rmap(). No functional change intended. Link: https://lkml.kernel.org/r/20230713120557.218592-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f3109ed7351..762be2f4244c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2255,7 +2255,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); - page_add_anon_rmap(page + i, vma, addr, false); + page_add_anon_rmap(page + i, vma, addr, RMAP_NONE); } VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); -- cgit v1.2.3 From f4d005af5b5499ddc71bbe704f2a6afa06dd3788 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 20:14:32 +0800 Subject: mm/memcg: fix obsolete comment above MEM_CGROUP_MAX_RECLAIM_LOOPS Since commit 5660048ccac8 ("mm: move memcg hierarchy reclaim to generic reclaim code"), mem_cgroup_hierarchical_reclaim() is already renamed to mem_cgroup_soft_reclaim(). Update the corresponding comment. 
Link: https://lkml.kernel.org/r/20230713121432.273381-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eaeb69ef9f5..93e3cc581b51 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -197,7 +197,7 @@ static struct move_charge_struct { }; /* - * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * Maximum loops in mem_cgroup_soft_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. */ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 -- cgit v1.2.3 From 34c876ce5eeda6c7aa89b2068e724cf84f409ebb Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:29 +0800 Subject: mm/page_table_check: remove unused parameters in page_table_check_clear() Patch series "Remove unused parameters in page_table_check". This series remove unused parameters in functions from page_table_check. The first 2 patches remove unused mm and addr parameters in static common functions page_table_check_clear and page_table_check_set. The last 6 patches remove unused addr parameter in some externed functions which only need addr for cleaned page_table_check_clear or page_table_check_set. There is no intended functional change. This patch (of 8): Remove unused mm and addr in function page_table_check_clear(). Link: https://lkml.kernel.org/r/20230713172636.1705415-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230713172636.1705415-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 93ec7690a0d8..9477b93d8463 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -58,8 +58,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. 
*/ -static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, - unsigned long pfn, unsigned long pgcnt) +static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext *page_ext; struct page *page; @@ -158,8 +157,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, return; if (pte_user_accessible_page(pte)) { - page_table_check_clear(mm, addr, pte_pfn(pte), - PAGE_SIZE >> PAGE_SHIFT); + page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); @@ -171,8 +169,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, return; if (pmd_user_accessible_page(pmd)) { - page_table_check_clear(mm, addr, pmd_pfn(pmd), - PMD_SIZE >> PAGE_SHIFT); + page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); @@ -184,8 +181,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, return; if (pud_user_accessible_page(pud)) { - page_table_check_clear(mm, addr, pud_pfn(pud), - PUD_SIZE >> PAGE_SHIFT); + page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); -- cgit v1.2.3 From 2f933eaf5bbf49b71319549464df44b87074a8ac Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:30 +0800 Subject: mm/page_table_check: remove unused parameters in page_table_check_set() Remove unused mm and addr in page_table_check_set(). Link: https://lkml.kernel.org/r/20230713172636.1705415-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 9477b93d8463..53a9a1e4f342 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -94,8 +94,7 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) * verify that it is of correct type and is not being mapped with a different * type to a different process. 
*/ -static void page_table_check_set(struct mm_struct *mm, unsigned long addr, - unsigned long pfn, unsigned long pgcnt, +static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext *page_ext; @@ -194,8 +193,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { - page_table_check_set(mm, addr, pte_pfn(pte), - PAGE_SIZE >> PAGE_SHIFT, + page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, pte_write(pte)); } } @@ -209,8 +207,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { - page_table_check_set(mm, addr, pmd_pfn(pmd), - PMD_SIZE >> PAGE_SHIFT, + page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } @@ -224,8 +221,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { - page_table_check_set(mm, addr, pud_pfn(pud), - PUD_SIZE >> PAGE_SHIFT, + page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } -- cgit v1.2.3 From aa232204c4689427cefa55fe975692b57291523a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:31 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear Remove unused addr in page_table_check_pte_clear and __page_table_check_pte_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 7 +++---- 6 files changed, 12 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e8a252e62b12..f7ea51f9c1c1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -928,7 +928,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 75970ee2bda2..5e07312cd3e1 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -529,7 +529,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb337987..5085e838b860 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1068,7 +1068,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1084,7 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - 
page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 01e16c7696ec..35c53c4b94d3 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,8 +14,7 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -46,13 +45,12 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, addr, pte); + __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -123,8 +121,7 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9fa34be65159..a1ccb13c4853 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -322,7 +322,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } #endif diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 53a9a1e4f342..a1015fc4d045 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -149,8 +149,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) page_ext_put(page_ext); } -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; @@ -191,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, pte_write(pte)); @@ -241,7 +240,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } -- cgit v1.2.3 From 1831414cd729a34af937d56ad684a66599de6344 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:32 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear Remove unused addr in page_table_check_pmd_clear and __page_table_check_pmd_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 ++--- 6 files changed, 10 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7ea51f9c1c1..6e3387ec6013 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -940,7 +940,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5e07312cd3e1..388c3af8a9f9 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -742,7 +742,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5085e838b860..5d71f933d933 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1133,7 +1133,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 35c53c4b94d3..0f777bca5283 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,8 +15,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, @@ -53,13 +52,12 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, addr, pmd); + __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -125,8 +123,7 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a1ccb13c4853..3edef5ed008f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,7 +425,7 @@ static inline 
pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index a1015fc4d045..51f2274c0a20 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -160,8 +160,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; @@ -204,7 +203,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pmd_clear(mm, addr, *pmdp); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); -- cgit v1.2.3 From 931c38e16499a057e30a3033f4d6a9c242f0f156 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:33 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear Remove unused addr in __page_table_check_pud_clear and page_table_check_pud_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5d71f933d933..f07c610c3458 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1144,7 +1144,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 0f777bca5283..5c9dc848a1bc 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,8 +16,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, - pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, @@ -60,13 +59,12 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, addr, pud); + __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_pte_set(struct mm_struct *mm, @@ -127,8 +125,7 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, 
pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3edef5ed008f..5f36c055794b 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -438,7 +438,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, address, pud); + page_table_check_pud_clear(mm, pud); return pud; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 51f2274c0a20..2643135bf45c 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -171,8 +171,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_clear); -void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, - pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; @@ -217,7 +216,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pud_clear(mm, addr, *pudp); + __page_table_check_pud_clear(mm, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); -- cgit v1.2.3 From 1066293d426d3000793c3c3b4276ef38b63ada4a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:34 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pte_set Remove unused addr in __page_table_check_pte_set and page_table_check_pte_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6e3387ec6013..f0a8dcbca04a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 388c3af8a9f9..90063afe8d36 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, addr, ptep, pteval); + page_table_check_pte_set(mm, ptep, pteval); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f07c610c3458..d14f0d92f04b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - 
page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 5c9dc848a1bc..63ebd9fcf28b 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,7 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, @@ -67,14 +66,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, addr, ptep, pte); + __page_table_check_pte_set(mm, ptep, pte); } static inline void page_table_check_pmd_set(struct mm_struct *mm, @@ -129,8 +127,7 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2643135bf45c..fc20ddc3a63e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,8 +182,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (&init_mm == mm) return; -- cgit v1.2.3 From a3b837130b5865521fa8662aceaa6ebc8d29389a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:35 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set Remove unused addr in __page_table_check_pmd_set and page_table_check_pmd_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 11 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f0a8dcbca04a..1fbf8d3f42b1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -524,7 +524,7 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -976,7 +976,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 90063afe8d36..a30658b2611b 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -687,7 +687,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -758,7 +758,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d14f0d92f04b..9cc26cb0bc9f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1030,7 +1030,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1167,7 +1167,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 63ebd9fcf28b..dd58dfb0e643 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -18,8 +18,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); 
-void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -75,14 +74,13 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, __page_table_check_pte_set(mm, ptep, pte); } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_set(mm, addr, pmdp, pmd); + __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, @@ -132,8 +130,7 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, { } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index fc20ddc3a63e..033956704a64 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -195,8 +195,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_set); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; -- cgit v1.2.3 From 6d144436d954311f2dbacb5bf7b084042448d83e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:36 +0800 Subject: mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set Remove unused addr in __page_table_check_pud_set and page_table_check_pud_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1fbf8d3f42b1..fe4b913589ee 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -531,7 +531,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a30658b2611b..44377f0d7c35 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -694,7 +694,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9cc26cb0bc9f..ada1bbf12961 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1037,7 +1037,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); native_set_pud(pudp, pud); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index dd58dfb0e643..7f6b9bf926c5 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud); +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -83,14 +82,13 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, __page_table_check_pmd_set(mm, pmdp, pmd); } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_set(mm, addr, pudp, pud); + __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -135,8 +133,7 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, { } -static inline void page_table_check_pud_set(struct mm_struct *mm, - 
unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 033956704a64..84c8163984e5 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -208,8 +208,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_set); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; -- cgit v1.2.3 From d5db4f9df9397d398256a2e33ad63c39c213b990 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:09 +0100 Subject: migrate: use folio_set_bh() instead of set_bh_page() This function was converted before folio_set_bh() existed. Catch up to the new API. Link: https://lkml.kernel.org/r/20230713035512.4139457-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index e9821e245e70..e21d5a7e7447 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -773,7 +773,7 @@ recheck_buffers: bh = head; do { - set_bh_page(bh, &dst->page, bh_offset(bh)); + folio_set_bh(bh, dst, bh_offset(bh)); bh = bh->b_this_page; } while (bh != head); -- cgit v1.2.3 From 063ff7cd8bf24aa14c897b6168591d3d0dae2a5e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:47 +0800 Subject: mm/page_ext: remove unused return value of offline_page_ext Patch series "minor cleanups for page_ext". This series contains some random minor cleanups for page_ext. More details can be found in respective patches. This patch (of 3): offline_page_ext always returns 0 and no caller checks the return value. Just remove unused return value of offline_page_ext. Link: https://lkml.kernel.org/r/20230714114749.1743032-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230714114749.1743032-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index dc1626be458b..096451df1c87 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -430,7 +430,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, return -ENOMEM; } -static int __meminit offline_page_ext(unsigned long start_pfn, +static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; @@ -454,8 +454,6 @@ static int __meminit offline_page_ext(unsigned long start_pfn, for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); - return 0; - } static int __meminit page_ext_callback(struct notifier_block *self, -- cgit v1.2.3 From 3c09be5a2be861d7f74b0251a8e77859b4c654cc Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:48 +0800 Subject: mm/page_ext: remove rollback for untouched mem_section in online_page_ext If init_section_page_ext failed, we only need rollback for mem_section before failed mem_section. 
Make rollback end point to failed mem_section to remove unnecessary rollback. As pfn += PAGES_PER_SECTION will be executed even if init_section_page_ext failed. So pfn points to mem_section after failed mem_section. Subtract one mem_section from pfn to get failed mem_section. Link: https://lkml.kernel.org/r/20230714114749.1743032-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index 096451df1c87..f052397dc70f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -424,6 +424,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, return 0; /* rollback */ + end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); -- cgit v1.2.3 From eb0da7f6e0832a689d845ca2d62152dc6b43e780 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:49 +0800 Subject: mm/page_ext: move functions around for minor cleanups to page_ext 1. move page_ext_get and page_ext_put down to remove forward declaration of lookup_page_ext. 2. move page_ext_init_flatmem_late down to existing non SPARS block to remove a new non SPARS block and to keep code for non SPARS tight. Link: https://lkml.kernel.org/r/20230714114749.1743032-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 96 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/mm/page_ext.c b/mm/page_ext.c index f052397dc70f..4548fcc66d74 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -90,7 +90,6 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size; static unsigned long total_usage; -static struct page_ext *lookup_page_ext(const struct page *page); bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) @@ -137,62 +136,16 @@ static void __init invoke_init_callbacks(void) } } -#ifndef CONFIG_SPARSEMEM -void __init page_ext_init_flatmem_late(void) -{ - invoke_init_callbacks(); -} -#endif - static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -/** - * page_ext_get() - Get the extended information for a page. - * @page: The page we're interested in. - * - * Ensures that the page_ext will remain valid until page_ext_put() - * is called. - * - * Return: NULL if no page_ext exists for this page. - * Context: Any context. Caller may not sleep until they have called - * page_ext_put(). - */ -struct page_ext *page_ext_get(struct page *page) -{ - struct page_ext *page_ext; - - rcu_read_lock(); - page_ext = lookup_page_ext(page); - if (!page_ext) { - rcu_read_unlock(); - return NULL; - } - - return page_ext; -} - -/** - * page_ext_put() - Working with page extended information is done. - * @page_ext: Page extended information received from page_ext_get(). - * - * The page extended information of the page may not be valid after this - * function is called. - * - * Return: None. - * Context: Any context with corresponding page_ext_get() is called. 
- */ -void page_ext_put(struct page_ext *page_ext) +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) { - if (unlikely(!page_ext)) - return; - - rcu_read_unlock(); + invoke_init_callbacks(); } -#ifndef CONFIG_SPARSEMEM - void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { @@ -536,3 +489,46 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) } #endif + +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext: Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} -- cgit v1.2.3 From 7613366a190202a8ebe8090ca4758b551f1b7feb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:05 +0800 Subject: mm/ioremap: define generic_ioremap_prot() and generic_iounmap() Define a generic version of ioremap_prot() and iounmap() that architectures can call after they have performed the necessary alteration to parameters and/or necessary verifications. Link: https://lkml.kernel.org/r/20230706154520.11257-5-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 4 ++++ mm/ioremap.c | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 587e7e9b9a37..a7ca2099ba19 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1073,9 +1073,13 @@ static inline bool iounmap_allowed(void *addr) } #endif +void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t prot); + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot); void iounmap(volatile void __iomem *addr); +void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { diff --git a/mm/ioremap.c b/mm/ioremap.c index 8652426282cc..db6234b9db59 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -11,8 +11,8 @@ #include #include -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) +void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t prot) { unsigned long offset, vaddr; phys_addr_t last_addr; @@ -28,7 +28,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr -= offset; size = PAGE_ALIGN(size + offset); - if (!ioremap_allowed(phys_addr, size, prot)) + if (!ioremap_allowed(phys_addr, size, pgprot_val(prot))) return NULL; area = get_vm_area_caller(size, VM_IOREMAP, @@ -38,17 +38,22 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, vaddr = (unsigned long)area->addr; area->phys_addr = phys_addr; - if (ioremap_page_range(vaddr, vaddr + size, phys_addr, - __pgprot(prot))) { + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { free_vm_area(area); return NULL; } return (void __iomem *)(vaddr + offset); } + +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) +{ + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); +} EXPORT_SYMBOL(ioremap_prot); -void iounmap(volatile void __iomem *addr) +void generic_iounmap(volatile void __iomem *addr) { void *vaddr = (void *)((unsigned long)addr & PAGE_MASK); @@ -58,4 +63,9 @@ void iounmap(volatile void __iomem *addr) if (is_vmalloc_addr(vaddr)) vunmap(vaddr); } + +void iounmap(volatile void __iomem *addr) +{ + generic_iounmap(addr); +} EXPORT_SYMBOL(iounmap); -- cgit v1.2.3 From dfdc6ba95768b4935058dcf2a8a09874289ba88f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:06 +0800 Subject: mm: ioremap: allow ARCH to have its own ioremap method definition Architectures can be converted to GENERIC_IOREMAP, to take standard ioremap_xxx() and iounmap() way. But some ARCH-es could have specific handling for ioremap_prot(), ioremap() and iounmap(), than standard methods. In oder to convert these ARCH-es to take GENERIC_IOREMAP method, allow these architecutres to have their own ioremap_prot(), ioremap() and iounmap() definitions. 
Link: https://lkml.kernel.org/r/20230706154520.11257-6-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 3 +++ mm/ioremap.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'mm') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a7ca2099ba19..39244c3ee797 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1081,11 +1081,14 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); +#ifndef ioremap +#define ioremap ioremap static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ return ioremap_prot(addr, size, _PAGE_IOREMAP); } +#endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ #ifndef ioremap_wc diff --git a/mm/ioremap.c b/mm/ioremap.c index db6234b9db59..9f34a8f90b58 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -46,12 +46,14 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, return (void __iomem *)(vaddr + offset); } +#ifndef ioremap_prot void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); +#endif void generic_iounmap(volatile void __iomem *addr) { @@ -64,8 +66,10 @@ void generic_iounmap(volatile void __iomem *addr) vunmap(vaddr); } +#ifndef iounmap void iounmap(volatile void __iomem *addr) { generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); +#endif -- cgit v1.2.3 From a5f6164831104a7441a0c4101c5b74cd59fbdfa6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:07 +0800 Subject: mm/ioremap: add slab availability checking in ioremap_prot Several architectures has done checking if slab if available in ioremap_prot(). In fact it should be done in generic ioremap_prot() since on any architecutre, slab allocator must be available before get_vm_area_caller() and vunmap() are used. Add the checking into generic_ioremap_prot(). Link: https://lkml.kernel.org/r/20230706154520.11257-7-bhe@redhat.com Suggested-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/ioremap.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/ioremap.c b/mm/ioremap.c index 9f34a8f90b58..86b82ec27d2b 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -18,6 +18,10 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr_t last_addr; struct vm_struct *area; + /* An early platform driver might end up here */ + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + /* Disallow wrap-around or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) -- cgit v1.2.3 From ab1cd02083d046a25b48a2cad71ace6d5ddf0e9e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:16 +0800 Subject: mm/ioremap: consider IOREMAP space in generic ioremap Architectures like powerpc have a dedicated space for IOREMAP mappings. If so, use it in generic_ioremap_prot(). Link: https://lkml.kernel.org/r/20230706154520.11257-16-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/ioremap.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/ioremap.c b/mm/ioremap.c index 86b82ec27d2b..68d9895144ad 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -11,6 +11,15 @@ #include #include +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. + */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif + void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) { @@ -35,8 +44,8 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, if (!ioremap_allowed(phys_addr, size, pgprot_val(prot))) return NULL; - area = get_vm_area_caller(size, VM_IOREMAP, - __builtin_return_address(0)); + area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, + IOREMAP_END, __builtin_return_address(0)); if (!area) return NULL; vaddr = (unsigned long)area->addr; @@ -66,7 +75,7 @@ void generic_iounmap(volatile void __iomem *addr) if (!iounmap_allowed(vaddr)) return; - if (is_vmalloc_addr(vaddr)) + if (is_ioremap_addr(vaddr)) vunmap(vaddr); } -- cgit v1.2.3 From 016fec91013cfb27c9f5a101a87ae8266537fed1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:17 +0800 Subject: mm: move is_ioremap_addr() into new header file Now is_ioremap_addr() is only used in kernel/iomem.c and gonna be used in mm/ioremap.c. Move it into its own new header file linux/ioremap.h. 
Link: https://lkml.kernel.org/r/20230706154520.11257-17-bhe@redhat.com Suggested-by: Christoph Hellwig Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 10 ---------- include/linux/ioremap.h | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 5 ----- kernel/iomem.c | 1 + mm/ioremap.c | 10 +--------- 5 files changed, 32 insertions(+), 24 deletions(-) create mode 100644 include/linux/ioremap.h (limited to 'mm') diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6a88bfdaa69b..445a22987aa3 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,16 +157,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -#ifdef CONFIG_PPC64 -#define is_ioremap_addr is_ioremap_addr -static inline bool is_ioremap_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= IOREMAP_BASE && addr < IOREMAP_END; -} -#endif /* CONFIG_PPC64 */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h new file mode 100644 index 000000000000..f0e99fc7dd8b --- /dev/null +++ b/include/linux/ioremap.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IOREMAP_H +#define _LINUX_IOREMAP_H + +#include +#include + +#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. + */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif +static inline bool is_ioremap_addr(const void *x) +{ + unsigned long addr = (unsigned long)kasan_reset_tag(x); + + return addr >= IOREMAP_START && addr < IOREMAP_END; +} +#else +static inline bool is_ioremap_addr(const void *x) +{ + return false; +} +#endif + +#endif /* _LINUX_IOREMAP_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bfb46483108c..0ae5654f665b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1055,11 +1055,6 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. 
*/ - -#ifndef is_ioremap_addr -#define is_ioremap_addr(x) is_vmalloc_addr(x) -#endif - #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); diff --git a/kernel/iomem.c b/kernel/iomem.c index 62c92e43aa0d..9682471e6471 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -3,6 +3,7 @@ #include #include #include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ diff --git a/mm/ioremap.c b/mm/ioremap.c index 68d9895144ad..a21a6c9fa5ab 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -10,15 +10,7 @@ #include #include #include - -/* - * Ioremap often, but not always uses the generic vmalloc area. E.g on - * Power ARCH, it could have different ioremap space. - */ -#ifndef IOREMAP_START -#define IOREMAP_START VMALLOC_START -#define IOREMAP_END VMALLOC_END -#endif +#include void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) -- cgit v1.2.3 From 95da27c4c6dd735c4f2798aca9095c086cf48faf Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:20 +0800 Subject: mm: ioremap: remove unneeded ioremap_allowed and iounmap_allowed Now there are no users of ioremap_allowed and iounmap_allowed, clean them up. Link: https://lkml.kernel.org/r/20230706154520.11257-20-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 26 -------------------------- mm/ioremap.c | 6 ------ 2 files changed, 32 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 39244c3ee797..bac63e874c7b 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1047,32 +1047,6 @@ static inline void iounmap(volatile void __iomem *addr) #elif defined(CONFIG_GENERIC_IOREMAP) #include -/* - * Arch code can implement the following two hooks when using GENERIC_IOREMAP - * ioremap_allowed() return a bool, - * - true means continue to remap - * - false means skip remap and return directly - * iounmap_allowed() return a bool, - * - true means continue to vunmap - * - false means skip vunmap and return directly - */ -#ifndef ioremap_allowed -#define ioremap_allowed ioremap_allowed -static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size, - unsigned long prot) -{ - return true; -} -#endif - -#ifndef iounmap_allowed -#define iounmap_allowed iounmap_allowed -static inline bool iounmap_allowed(void *addr) -{ - return true; -} -#endif - void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); diff --git a/mm/ioremap.c b/mm/ioremap.c index a21a6c9fa5ab..3e049dfb28bd 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -33,9 +33,6 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr -= offset; size = PAGE_ALIGN(size + offset); - if (!ioremap_allowed(phys_addr, size, 
pgprot_val(prot))) - return NULL; - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, __builtin_return_address(0)); if (!area) @@ -64,9 +61,6 @@ void generic_iounmap(volatile void __iomem *addr) { void *vaddr = (void *)((unsigned long)addr & PAGE_MASK); - if (!iounmap_allowed(vaddr)) - return; - if (is_ioremap_addr(vaddr)) vunmap(vaddr); } -- cgit v1.2.3 From 65c8d30e679bdffeeaa0b84b7094a3c719aa6585 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 17 Jul 2023 21:10:01 +0800 Subject: mm/tlbbatch: introduce arch_tlbbatch_should_defer() Patch series "arm64: support batched/deferred tlb shootdown during page reclamation/migration", v11. Though ARM64 has the hardware to do tlb shootdown, the hardware broadcasting is not free. A simplest micro benchmark shows even on snapdragon 888 with only 8 cores, the overhead for ptep_clear_flush is huge even for paging out one page mapped by only one process: 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush While pages are mapped by multiple processes or HW has more CPUs, the cost should become even higher due to the bad scalability of tlb shootdown. The same benchmark can result in 16.99% CPU consumption on ARM64 server with around 100 cores according to the test on patch 4/4. This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by 1. only send tlbi instructions in the first stage - arch_tlbbatch_add_mm() 2. wait for the completion of tlbi by dsb while doing tlbbatch sync in arch_tlbbatch_flush() Testing on snapdragon shows the overhead of ptep_clear_flush is removed by the patchset. The micro benchmark becomes 5% faster even for one page mapped by single process on snapdragon 888. Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset does some renaming/extension for the current implementation first (Patch 1-3), then add the support on arm64 (Patch 4). This patch (of 4): The entire scheme of deferred TLB flush in reclaim path rests on the fact that the cost to refill TLB entries is less than flushing out individual entries by sending IPI to remote CPUs. But architecture can have different ways to evaluate that. Hence apart from checking TTU_BATCH_FLUSH in the TTU flags, rest of the decision should be architecture specific. 
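For contrast with the x86 hook below, a hypothetical architecture whose TLB invalidation is broadcast in hardware (so there are no remote IPIs to avoid) could simply defer unconditionally; this sketch is illustrative and is not the arm64 implementation added in patch 4/4:

static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
	/*
	 * There is no IPI cost to weigh against the TLB refill cost, so
	 * batching the invalidations is always worthwhile here.
	 */
	return true;
}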
[yangyicong@hisilicon.com: rebase and fix incorrect return value type] Link: https://lkml.kernel.org/r/20230717131004.12662-1-yangyicong@huawei.com Link: https://lkml.kernel.org/r/20230717131004.12662-2-yangyicong@huawei.com Signed-off-by: Anshuman Khandual [https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khandual@linux.vnet.ibm.com/] Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Anshuman Khandual Reviewed-by: Barry Song Reviewed-by: Xin Hao Tested-by: Punit Agrawal Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Cc: Barry Song Cc: Mel Gorman Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 12 ++++++++++++ mm/rmap.c | 9 +-------- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385..cf2a1de5d388 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -253,6 +253,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* diff --git a/mm/rmap.c b/mm/rmap.c index 2668f5ea3534..7a479e22d288 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -688,17 +688,10 @@ retry: */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { - bool should_defer = false; - if (!(flags & TTU_BATCH_FLUSH)) return false; - /* If remote CPUs need to be flushed then defer batch the flush */ - if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) - should_defer = true; - put_cpu(); - - return should_defer; + return arch_tlbbatch_should_defer(mm); } /* -- cgit v1.2.3 From f73419bb89d606de9be2043febf0957d56627a5b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:02 +0800 Subject: mm/tlbbatch: rename and extend some functions This patch does some preparation works to extend batched TLB flush to arm64. Including: - Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept an additional argument for address, architectures like arm64 may need this for tlbi. - Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match its current function since we don't need to handle mm on architectures like arm64 and add_mm is not proper, add_pending will make sense to both as on x86 we're pending the TLB flush operations while on arm64 we're pending the synchronize operations. This intends no functional changes on x86. 
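As a sketch of why the extra address argument matters (the helper name is fictitious and this is not the arm64 patch), an architecture that invalidates by virtual address could kick off a per-address invalidation here and only synchronise later in arch_tlbbatch_flush():

static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
					     struct mm_struct *mm,
					     unsigned long uaddr)
{
	/* fictitious helper: start a broadcast invalidate for this address */
	foo_start_tlbi_va(mm, uaddr);
	/* nothing to record in @batch; arch_tlbbatch_flush() only barriers */
}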
Link: https://lkml.kernel.org/r/20230717131004.12662-3-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++-- include/linux/mm_types_task.h | 4 ++-- mm/rmap.c | 12 +++++++----- 3 files changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf2a1de5d388..1c7d3a36e16c 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -276,8 +276,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 5414b5c6a103..aa44fff8bb9d 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -52,8 +52,8 @@ struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * The arch code makes the following promise: generic code can modify a - * PTE, then call arch_tlbbatch_add_mm() (which internally provides all - * needed barriers), then call arch_tlbbatch_flush(), and the entries + * PTE, then call arch_tlbbatch_add_pending() (which internally provides + * all needed barriers), then call arch_tlbbatch_flush(), and the entries * will be flushed on all CPUs by the time that arch_tlbbatch_flush() * returns. 
*/ diff --git a/mm/rmap.c b/mm/rmap.c index 7a479e22d288..f6fb821d56a8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; @@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) if (!pte_accessible(mm, pteval)) return; - arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); + arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required = true; /* @@ -726,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm) } } #else -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { } @@ -1579,7 +1581,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } @@ -1962,7 +1964,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } -- cgit v1.2.3 From db6c1f6f236dbcd271d51d37675bbccfcea7c7be Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Jul 2023 21:10:03 +0800 Subject: mm/tlbbatch: introduce arch_flush_tlb_batched_pending() Currently we'll flush the mm in flush_tlb_batched_pending() to avoid race between reclaim unmaps pages by batched TLB flush and mprotect/munmap/etc. Other architectures like arm64 may only need a synchronization barrier(dsb) here rather than a full mm flush. So add arch_flush_tlb_batched_pending() to allow an arch-specific implementation here. This intends no functional changes on x86 since still a full mm flush for x86. 
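To make the "barrier instead of full flush" point concrete, such an architecture might provide no more than a completion barrier here (sketch only, not the actual arm64 hunk):

static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
	/*
	 * The batched invalidations were already broadcast by hardware;
	 * just wait for them to complete rather than flushing the mm.
	 */
	dsb(ish);
}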
Link: https://lkml.kernel.org/r/20230717131004.12662-4-yangyicong@huawei.com Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Kefeng Wang Cc: lipeifeng Cc: Mark Rutland Cc: Mel Gorman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Xin Hao Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++++ mm/rmap.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1c7d3a36e16c..837e4a50281a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -284,6 +284,11 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); +} + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, diff --git a/mm/rmap.c b/mm/rmap.c index f6fb821d56a8..5717517e4040 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -717,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { - flush_tlb_mm(mm); + arch_flush_tlb_batched_pending(mm); /* * If the new TLB flushing is pending during flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. -- cgit v1.2.3 From 58f341f772bb48b3d7b13dd6c0f9705ebdd02592 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 17 Jul 2023 19:36:44 +0800 Subject: mm/memcg: minor cleanup for mc_handle_present_pte() When pagetable lock is held, the page will always be page_mapped(). So remove unneeded page_mapped() check. Also the page can't be freed from under us in this case. So use get_page() to get extra page reference to simplify the code. No functional change intended. Link: https://lkml.kernel.org/r/20230717113644.3026478-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93e3cc581b51..51772df1abc5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5639,7 +5639,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, { struct page *page = vm_normal_page(vma, addr, ptent); - if (!page || !page_mapped(page)) + if (!page) return NULL; if (PageAnon(page)) { if (!(mc.flags & MOVE_ANON)) @@ -5648,8 +5648,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, if (!(mc.flags & MOVE_FILE)) return NULL; } - if (!get_page_unless_zero(page)) - return NULL; + get_page(page); return page; } -- cgit v1.2.3 From 0792e47d566244e150e320708e0be708a9db1a93 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 17 Jul 2023 06:58:11 +0000 Subject: mm/mm_init.c: drop node_start_pfn from adjust_zone_range_for_zone_movable() node_start_pfn is not used in adjust_zone_range_for_zone_movable(), so it is pointless to waste a function argument. Drop the parameter. 
Link: https://lkml.kernel.org/r/20230717065811.1262-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 2daae1dd5755..7e3fcdbe997b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1105,7 +1105,6 @@ void __ref memmap_init_zone_device(struct zone *zone, */ static void __init adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, - unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, unsigned long *zone_end_pfn) @@ -1222,9 +1221,8 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); - adjust_zone_range_for_zone_movable(nid, zone_type, - node_start_pfn, node_end_pfn, - zone_start_pfn, zone_end_pfn); + adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn, + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) -- cgit v1.2.3 From 8d3a7d797c1a18a24f602a7398103ee894c5bc3b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 15 Jul 2023 11:51:11 +0800 Subject: memory tier: use helper macro __ATTR_RW() Use helper macro __ATTR_RW to define numa demotion attributes. Minor readability improvement. Link: https://lkml.kernel.org/r/20230715035111.2656784-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c49ab03f49b1..37a4f59d9585 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -672,16 +672,16 @@ bool numa_demotion_enabled = false; #ifdef CONFIG_MIGRATION #ifdef CONFIG_SYSFS -static ssize_t numa_demotion_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", numa_demotion_enabled ? "true" : "false"); } -static ssize_t numa_demotion_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { ssize_t ret; @@ -693,8 +693,7 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj, } static struct kobj_attribute numa_demotion_enabled_attr = - __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, - numa_demotion_enabled_store); + __ATTR_RW(demotion_enabled); static struct attribute *numa_attrs[] = { &numa_demotion_enabled_attr.attr, -- cgit v1.2.3 From cabdf74e6b319c989eb8e812f1854291ae0af1c0 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 18 Jul 2023 15:30:19 +0800 Subject: mm: kfence: allocate kfence_metadata at runtime kfence_metadata is currently a static array. For the purpose of allocating scalable __kfence_pool, we first change it to runtime allocation of metadata. Since the size of an object of kfence_metadata is 1160 bytes, we can save at least 72 pages (with default 256 objects) without enabling kfence. 
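For reference, the saving quoted above works out as follows (assuming 4 KiB pages): 256 objects * 1160 bytes = 296,960 bytes of metadata, and 296,960 / 4,096 is roughly 72.5, so the old static array pinned 73 page-aligned pages; at least 72 of them are now only allocated once KFENCE is actually enabled.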
[akpm@linux-foundation.org: restore newline, per Marco] Link: https://lkml.kernel.org/r/20230718073019.52513-1-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/kfence.h | 11 +++-- mm/kfence/core.c | 123 ++++++++++++++++++++++++++++++++++--------------- mm/kfence/kfence.h | 5 +- mm/mm_init.c | 2 +- 4 files changed, 97 insertions(+), 44 deletions(-) (limited to 'mm') diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 726857a4b680..401af4757514 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr) } /** - * kfence_alloc_pool() - allocate the KFENCE pool via memblock + * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE + * metadata via memblock */ -void __init kfence_alloc_pool(void); +void __init kfence_alloc_pool_and_metadata(void); /** * kfence_init() - perform KFENCE initialization at boot time * - * Requires that kfence_alloc_pool() was called before. This sets up the - * allocation gate timer, and requires that workqueues are available. + * Requires that kfence_alloc_pool_and_metadata() was called before. This sets + * up the allocation gate timer, and requires that workqueues are available. */ void __init kfence_init(void); @@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } -static inline void kfence_alloc_pool(void) { } +static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } static inline void kfence_shutdown_cache(struct kmem_cache *s) { } static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index dad3c0eb70a0..96fd0411f5c5 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -116,7 +116,15 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ * backing pages (in __kfence_pool). */ static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); -struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +struct kfence_metadata *kfence_metadata __read_mostly; + +/* + * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). + * So introduce kfence_metadata_init to initialize metadata, and then make + * kfence_metadata visible after initialization is successful. This prevents + * potential UAF or access to uninitialized metadata. + */ +static struct kfence_metadata *kfence_metadata_init __read_mostly; /* Freelist with available objects. */ static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); @@ -591,7 +599,7 @@ static unsigned long kfence_init_pool(void) __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG - slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | + slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg | MEMCG_DATA_OBJCGS; #endif } @@ -610,7 +618,7 @@ static unsigned long kfence_init_pool(void) } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { - struct kfence_metadata *meta = &kfence_metadata[i]; + struct kfence_metadata *meta = &kfence_metadata_init[i]; /* Initialize metadata. 
*/ INIT_LIST_HEAD(&meta->list); @@ -626,6 +634,12 @@ static unsigned long kfence_init_pool(void) addr += 2 * PAGE_SIZE; } + /* + * Make kfence_metadata visible only when initialization is successful. + * Otherwise, if the initialization fails and kfence_metadata is freed, + * it may cause UAF in kfence_shutdown_cache(). + */ + smp_store_release(&kfence_metadata, kfence_metadata_init); return 0; reset_slab: @@ -672,26 +686,10 @@ static bool __init kfence_init_pool_early(void) */ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; - return false; -} - -static bool kfence_init_pool_late(void) -{ - unsigned long addr, free_size; - addr = kfence_init_pool(); - - if (!addr) - return true; + memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); + kfence_metadata_init = NULL; - /* Same as above. */ - free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); -#ifdef CONFIG_CONTIG_ALLOC - free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); -#else - free_pages_exact((void *)addr, free_size); -#endif - __kfence_pool = NULL; return false; } @@ -841,19 +839,30 @@ static void toggle_allocation_gate(struct work_struct *work) /* === Public interface ===================================================== */ -void __init kfence_alloc_pool(void) +void __init kfence_alloc_pool_and_metadata(void) { if (!kfence_sample_interval) return; - /* if the pool has already been initialized by arch, skip the below. */ - if (__kfence_pool) - return; - - __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); - + /* + * If the pool has already been initialized by arch, there is no need to + * re-allocate the memory pool. + */ if (!__kfence_pool) + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) { pr_err("failed to allocate pool\n"); + return; + } + + /* The memory allocated by memblock has been zeroed out. 
*/ + kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); + if (!kfence_metadata_init) { + pr_err("failed to allocate metadata\n"); + memblock_free(__kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = NULL; + } } static void kfence_init_enable(void) @@ -895,33 +904,69 @@ void __init kfence_init(void) static int kfence_init_late(void) { - const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE; + const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; + const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; + unsigned long addr = (unsigned long)__kfence_pool; + unsigned long free_size = KFENCE_POOL_SIZE; + int err = -ENOMEM; + #ifdef CONFIG_CONTIG_ALLOC struct page *pages; - pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL); + pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, + NULL); if (!pages) return -ENOMEM; + __kfence_pool = page_to_virt(pages); + pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, + NULL); + if (pages) + kfence_metadata_init = page_to_virt(pages); #else - if (nr_pages > MAX_ORDER_NR_PAGES) { + if (nr_pages_pool > MAX_ORDER_NR_PAGES || + nr_pages_meta > MAX_ORDER_NR_PAGES) { pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); return -EINVAL; } + __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); if (!__kfence_pool) return -ENOMEM; + + kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); #endif - if (!kfence_init_pool_late()) { - pr_err("%s failed\n", __func__); - return -EBUSY; + if (!kfence_metadata_init) + goto free_pool; + + memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); + addr = kfence_init_pool(); + if (!addr) { + kfence_init_enable(); + kfence_debugfs_init(); + return 0; } - kfence_init_enable(); - kfence_debugfs_init(); + pr_err("%s failed\n", __func__); + free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); + err = -EBUSY; - return 0; +#ifdef CONFIG_CONTIG_ALLOC + free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), + nr_pages_meta); +free_pool: + free_contig_range(page_to_pfn(virt_to_page((void *)addr)), + free_size / PAGE_SIZE); +#else + free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); +free_pool: + free_pages_exact((void *)addr, free_size); +#endif + + kfence_metadata_init = NULL; + __kfence_pool = NULL; + return err; } static int kfence_enable_late(void) @@ -941,6 +986,10 @@ void kfence_shutdown_cache(struct kmem_cache *s) struct kfence_metadata *meta; int i; + /* Pairs with release in kfence_init_pool(). 
*/ + if (!smp_load_acquire(&kfence_metadata)) + return; + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { bool in_use; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 392fb273e7bd..f46fbb03062b 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -102,7 +102,10 @@ struct kfence_metadata { #endif }; -extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \ + CONFIG_KFENCE_NUM_OBJECTS) + +extern struct kfence_metadata *kfence_metadata; static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 7e3fcdbe997b..acb0ac194672 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2767,7 +2767,7 @@ void __init mm_core_init(void) */ page_ext_init_flatmem(); mem_debugging_and_hardening_init(); - kfence_alloc_pool(); + kfence_alloc_pool_and_metadata(); report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); -- cgit v1.2.3 From 89be82b4fed258b63a201d92fca95e7c55913c23 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 17:21:36 +0800 Subject: mm/rmap: correct stale comment of rmap_walk_anon and rmap_walk_file 1. update page to folio in comment 2. add comment of new added @locked Link: https://lkml.kernel.org/r/20230718092136.1935789-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 5717517e4040..1355bf686fae 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2397,11 +2397,12 @@ out: /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method - * @page: the page to be handled + * @folio: the folio to be handled * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. + * Find all the mappings of a folio using the mapping pointer and the vma + * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) @@ -2445,10 +2446,11 @@ static void rmap_walk_anon(struct folio *folio, /* * rmap_walk_file - do something to file page using the object-based rmap method - * @page: the page to be handled + * @folio: the folio to be handled * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock * - * Find all the mappings of a page using the mapping pointer and the vma chains + * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to. */ static void rmap_walk_file(struct folio *folio, -- cgit v1.2.3 From affd26b1fbd67fceea70d9ceac40ff4815afbeb5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 19 Jul 2023 11:41:45 -0700 Subject: mm/hugetlb: get rid of page_hstate() Convert the last page_hstate() user to use folio_hstate() so page_hstate() can be safely removed. 
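For any caller that still needs the conversion (for instance out-of-tree code), the drop-in replacement is the same folio round-trip the removed wrapper performed internally:

	struct hstate *h = folio_hstate(page_folio(page));	/* was: page_hstate(page) */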
Link: https://lkml.kernel.org/r/20230719184145.301911-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ---------- mm/hugetlb.c | 6 +++--- mm/page_isolation.c | 8 ++++---- 3 files changed, 7 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f4bac3df59e..0a393bc02f25 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -841,11 +841,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return size_to_hstate(folio_size(folio)); } -static inline struct hstate *page_hstate(struct page *page) -{ - return folio_hstate(page_folio(page)); -} - static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; @@ -1062,11 +1057,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return NULL; } -static inline struct hstate *page_hstate(struct page *page) -{ - return NULL; -} - static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b076eb07a29..412a3eec081c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1785,10 +1785,10 @@ static void free_hpage_workfn(struct work_struct *work) node = node->next; page->mapping = NULL; /* - * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() - * is going to trigger because a previous call to + * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in + * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will call folio_set_compound_dtor - * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate() * directly. */ h = size_to_hstate(page_size(page)); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6599cc965e21..bcf99ba747a0 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -79,17 +79,17 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * handle each tail page individually in migration. */ if (PageHuge(page) || PageTransCompound(page)) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); unsigned int skip_pages; if (PageHuge(page)) { - if (!hugepage_migration_supported(page_hstate(head))) + if (!hugepage_migration_supported(folio_hstate(folio))) return page; - } else if (!PageLRU(head) && !__PageMovable(head)) { + } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { return page; } - skip_pages = compound_nr(head) - (page - head); + skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page); pfn += skip_pages - 1; continue; } -- cgit v1.2.3 From 2574d5e4df32a7eed7176665a5ea08eadb704d70 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:48 -0400 Subject: mm/mmap: clean up validate_mm() calls Patch series "More strict maple tree lockdep", v2. Linus asked for more strict maple tree lockdep checking [1] and for them to resume the normal path through Andrews tree. This series of patches adds checks to ensure the lock is held in write mode during the write path of the maple tree instead of checking if it's held at all. It also reduces the validate_mm() calls by consolidating into commonly used functions (patch 0001), and removes the necessity of holding the lock on the detached tree during munmap() operations. 
This patch (of 4): validate_mm() calls are too spread out and duplicated in numerous locations. Also, now that the stack write is done under the write lock, it is not necessary to validate the mm prior to write operations. Add a validate_mm() to the stack expansions, and to vma_complete() so that numerous others may be dropped. Note that vma_link() (and also insert_vm_struct() by call path) already call validate_mm(). vma_merge() also had an unnecessary call to vma_iter_free() since the logic change to abort earlier if no merging is necessary. Drop extra validate_mm() calls at the start of functions and error paths which won't write to the tree. Relocate the validate_mm() call in the do_brk_flags() to avoid re-running the same test when vma_complete() is used. The call within the error path of mmap_region() is left intentionally because of the complexity of the function and the potential of drivers modifying the tree. Link: https://lkml.kernel.org/r/20230714195551.894800-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230714195551.894800-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- mm/mmap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index e1586b2f938e..cdbeff089049 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -594,6 +594,7 @@ again: } if (vp->insert && vp->file) uprobe_mmap(vp->insert); + validate_mm(mm); } /* @@ -676,7 +677,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; nomem: @@ -716,7 +716,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_end = end; vma->vm_pgoff = pgoff; vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; } @@ -889,7 +888,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, pgoff_t pglen = (end - addr) >> PAGE_SHIFT; long adj_start = 0; - validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. 
@@ -1016,10 +1014,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } vma_complete(&vp, vmi, mm); - vma_iter_free(vmi); - validate_mm(mm); khugepaged_enter_vma(res, vm_flags); - return res; } @@ -1194,7 +1189,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; - validate_mm(mm); *populate = 0; if (!len) @@ -2023,6 +2017,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2113,6 +2108,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); + validate_mm(mm); return error; } @@ -2290,7 +2286,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) remove_vma(vma, false); } vm_unacct_memory(nr_accounted); - validate_mm(mm); } /* @@ -2327,8 +2322,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm(vma->vm_mm); - WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2385,7 +2378,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2394,7 +2386,6 @@ out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm(vma->vm_mm); return err; } @@ -3045,7 +3036,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. @@ -3097,6 +3087,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; + validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); @@ -3105,7 +3096,6 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); - validate_mm(mm); return 0; mas_store_fail: @@ -3286,7 +3276,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3345,7 +3334,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm(mm); return new_vma; out_vma_link: @@ -3361,7 +3349,6 @@ out_free_mempol: out_free_vma: vm_area_free(new_vma); out: - validate_mm(mm); return NULL; } @@ -3498,7 +3485,6 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3521,12 +3507,10 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm(mm); return ERR_PTR(ret); } -- cgit v1.2.3 From 02fdb25fb41c563a3afbd4a97469c527a6c5abbf Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Wed, 5 Jul 2023 14:47:49 -0400 Subject: mm/mmap: change detached vma locking scheme Don't set the lock to the mm lock so that the detached VMA tree does not complain about being unlocked when the mmap_lock is dropped prior to freeing the tree. Introduce mt_on_stack() for setting the external lock to NULL only when LOCKDEP is used. Move the destroying of the detached tree outside the mmap lock all together. Link: https://lkml.kernel.org/r/20230719183142.ktgcmuj2pnlr3h3s@revolver Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 +++ mm/mmap.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6618c1512886..e278b9598428 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -187,10 +187,13 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map + +#define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) +#define mt_on_stack(mt) do { } while (0) #endif /* diff --git a/mm/mmap.c b/mm/mmap.c index cdbeff089049..7bd1caa09ddd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2428,7 +2428,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long locked_vm = 0; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_set_external_lock(&mt_detach, &mm->mmap_lock); + mt_on_stack(mt_detach); /* * If we need to split any vma, do it now to save pain later. @@ -2546,11 +2546,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); - __mt_destroy(&mt_detach); validate_mm(mm); if (unlock) mmap_read_unlock(mm); + __mt_destroy(&mt_detach); return 0; clear_tree_failed: -- cgit v1.2.3 From 57b037dbbadc0d8de44bf06a62ad2d9265ef31d3 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:04 +1000 Subject: mmu_notifiers: fixup comment in mmu_interval_read_begin() The comment in mmu_interval_read_begin() refers to a function that doesn't exist and uses the wrong call-back name. The op for mmu interval notifiers is mmu_interval_notifier_ops->invalidate() so fix the comment up to reflect that. Link: https://lkml.kernel.org/r/e7a09081b3ac82a03c189409f1262fc2df91071e.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- mm/mmu_notifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 50c0dde1354f..b7ad1559c72f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -199,7 +199,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) * invalidate_start/end and is colliding. 
* * The locking looks broadly like this: - * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * mn_itree_inv_start(): mmu_interval_read_begin(): * spin_lock * seq = READ_ONCE(interval_sub->invalidate_seq); * seq == subs->invalidate_seq @@ -207,7 +207,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) * spin_lock * seq = ++subscriptions->invalidate_seq * spin_unlock - * op->invalidate_range(): + * op->invalidate(): * user_lock * mmu_interval_set_seq() * interval_sub->invalidate_seq = seq -- cgit v1.2.3 From ec8832d007cb7b50229ad5745eec35b847cc9120 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:06 +1000 Subject: mmu_notifiers: don't invalidate secondary TLBs as part of mmu_notifier_invalidate_range_end() Secondary TLBs are now invalidated from the architecture specific TLB invalidation functions. Therefore there is no need to explicitly notify or invalidate as part of the range end functions. This means we can remove mmu_notifier_invalidate_range_end_only() and some of the ptep_*_notify() functions. Link: https://lkml.kernel.org/r/90d749d03cbab256ca0edeb5287069599566d783.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 56 ++------------------------------------------ kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 25 ++++---------------- mm/hugetlb.c | 1 - mm/memory.c | 8 ++----- mm/migrate_device.c | 9 ++----- mm/mmu_notifier.c | 25 +++----------------- mm/rmap.c | 40 ------------------------------- 8 files changed, 14 insertions(+), 152 deletions(-) (limited to 'mm') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 64a3e051c3c4..f2e9edc6aa43 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -395,8 +395,7 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, - bool only_end); +extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool @@ -481,14 +480,7 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) might_sleep(); if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, false); -} - -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ - if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, true); + __mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -582,45 +574,6 @@ static inline void mmu_notifier_range_init_owner( __young; \ }) -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - unsigned long ___addr = __address & PAGE_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pte_t ___pte; \ - \ - ___pte = ptep_clear_flush(__vma, __address, __ptep); \ - 
mmu_notifier_invalidate_range(___mm, ___addr, \ - ___addr + PAGE_SIZE); \ - \ - ___pte; \ -}) - -#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - -#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pud_t ___pud; \ - \ - ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PUD_SIZE); \ - \ - ___pud; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU @@ -711,11 +664,6 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ -} - static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0ac5b874919..3048589e2e85 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); - ptep_clear_flush_notify(vma, addr, pvmw.pte); + ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 762be2f4244c..3ece117de898 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2003,7 +2003,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, count_vm_event(THP_SPLIT_PUD); - pudp_huge_clear_flush_notify(vma, haddr, pud); + pudp_huge_clear_flush(vma, haddr, pud); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, @@ -2023,11 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pudp_huge_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2094,7 +2090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { - old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it @@ -2304,20 +2300,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, out: spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback. - * They are 3 cases to consider inside __split_huge_pmd_locked(): - * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious - * 2) __split_huge_zero_page_pmd() read only zero page and any write - * fault will trigger a flush_notify before pointing to a new page - * (it is fine if the secondary mmu keeps pointing to the old zero - * page in the meantime) - * 3) Split a huge pmd into pte pointing to the same page. 
No need - * to invalidate secondary tlb entry they are all still valid. - * any further changes to individual pte will notify. So no need - * to call mmu_notifier->invalidate_range() - */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 412a3eec081c..4672752b0b17 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5688,7 +5688,6 @@ retry_avoidcopy: /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); - mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(&old_folio->page, vma, true); hugepage_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) diff --git a/mm/memory.c b/mm/memory.c index 44d11812a88f..3e16f0637376 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3155,7 +3155,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ - ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); /* @@ -3201,11 +3201,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); } - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() did already call it. - */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e29626e1329e..6c556b5876c6 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -658,7 +658,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); - ptep_clear_flush_notify(vma, addr, ptep); + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); } else { @@ -763,13 +763,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() - * did already call it. - */ if (notified) - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } /** diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index b7ad1559c72f..453a156d93c0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -551,7 +551,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) static void mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, - struct mmu_notifier_range *range, bool only_end) + struct mmu_notifier_range *range) { struct mmu_notifier *subscription; int id; @@ -559,24 +559,6 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - /* - * Call invalidate_range here too to avoid the need for the - * subsystem of having to register an invalidate_range_end - * call-back when there is invalidate_range already. 
Usually a - * subsystem registers either invalidate_range_start()/end() or - * invalidate_range(), so this will be no additional overhead - * (besides the pointer check). - * - * We skip call to invalidate_range() if we know it is safe ie - * call site use mmu_notifier_invalidate_range_only_end() which - * is safe to do when we know that a call to invalidate_range() - * already happen under page table lock. - */ - if (!only_end && subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, - range->mm, - range->start, - range->end); if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) non_block_start(); @@ -589,8 +571,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, - bool only_end) +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { struct mmu_notifier_subscriptions *subscriptions = range->mm->notifier_subscriptions; @@ -600,7 +581,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, mn_itree_inv_end(subscriptions); if (!hlist_empty(&subscriptions->list)) - mn_hlist_invalidate_end(subscriptions, range, only_end); + mn_hlist_invalidate_end(subscriptions, range); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } diff --git a/mm/rmap.c b/mm/rmap.c index 1355bf686fae..51ec8aa5e61f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -985,13 +985,6 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) #endif } - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. - * - * See Documentation/mm/mmu_notifier.rst - */ if (ret) cleaned++; } @@ -1549,8 +1542,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, - range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map @@ -1623,9 +1614,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * copied pages. 
*/ dec_mm_counter(mm, mm_counter(&folio->page)); - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else if (folio_test_anon(folio)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1637,9 +1625,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, folio_test_swapcache(folio))) { WARN_ON_ONCE(1); ret = false; - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); page_vma_mapped_walk_done(&pvmw); break; } @@ -1670,9 +1655,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ if (ref_count == 1 + map_count && !folio_test_dirty(folio)) { - /* Invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, - address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1727,9 +1709,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); - /* Invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else { /* * This is a locked file-backed folio, @@ -1745,13 +1724,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter_file(&folio->page)); } discard: - /* - * No need to call mmu_notifier_invalidate_range() it has be - * done above for all cases requiring it to happen under page - * table lock before mmu_notifier_invalidate_range_end() - * - * See Documentation/mm/mmu_notifier.rst - */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); @@ -1930,8 +1902,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, - range.start, range.end); /* * The ref count of the PMD page was @@ -2036,9 +2006,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * copied pages. */ dec_mm_counter(mm, mm_counter(&folio->page)); - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else { swp_entry_t entry; pte_t swp_pte; @@ -2102,13 +2069,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ } - /* - * No need to call mmu_notifier_invalidate_range() it has be - * done above for all cases requiring it to happen under page - * table lock before mmu_notifier_invalidate_range_end() - * - * See Documentation/mm/mmu_notifier.rst - */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); -- cgit v1.2.3 From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption as these callbacks happen too soon/late during page unmap. 
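For illustration only (this sketch is not part of the patch, the handler names are made up, and it uses the new callback name introduced below), the two registration styles look roughly like:

static const struct mmu_notifier_ops example_sw_tlb_ops = {
	/* software TLB (e.g. a KVM-style user): secondary mappings are
	 * torn down in invalidate_range_start() and re-established lazily
	 * after invalidate_range_end() */
	.invalidate_range_start	= example_invalidate_start,
	.invalidate_range_end	= example_invalidate_end,
};

static const struct mmu_notifier_ops example_hw_tlb_ops = {
	/* hardware TLB sharing the CPU page tables: flushed directly from
	 * the architecture TLB flush code */
	.arch_invalidate_secondary_tlbs	= example_flush_secondary_tlbs,
};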
mmu notifier users should therefore either use the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update documention. Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 6 ++-- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++--- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 2 +- drivers/iommu/amd/iommu_v2.c | 10 +++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 +++---- drivers/iommu/intel/svm.c | 8 ++--- drivers/misc/ocxl/link.c | 8 ++--- include/linux/mmu_notifier.h | 48 ++++++++++++------------- mm/huge_memory.c | 4 +-- mm/hugetlb.c | 7 ++-- mm/mmu_notifier.c | 21 ++++++++--- 13 files changed, 76 insertions(+), 63 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a99349d10c0e..84a05a0bd2b6 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -253,7 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -265,7 +265,7 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); - mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } @@ -400,7 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index f3fb49fd32fe..17075c78d4bc 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,7 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d44902a4962..06e647ef19d1 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,7 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + 
mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1021,7 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1230,7 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1395,7 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0a5432364c5a..6ab42caaa67a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -283,7 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93b2f81f09da..2d253919b3e8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1037,7 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 261352a23271..2596466cd5a6 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -355,9 +355,9 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn) return container_of(mn, struct pasid_state, mn); } -static void mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -391,8 +391,8 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .invalidate_range = mn_invalidate_range, + .release = mn_release, + .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, }; static void set_pri_tag_status(struct pasid_state *pasid_state, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2a19784b698e..dbc812a0e57e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -186,9 +186,10 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) } } -static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + 
unsigned long start, + unsigned long end) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; @@ -247,9 +248,9 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { - .invalidate_range = arm_smmu_mm_invalidate_range, - .release = arm_smmu_mm_release, - .free_notifier = arm_smmu_mmu_notifier_free, + .arch_invalidate_secondary_tlbs = arm_smmu_mm_arch_invalidate_secondary_tlbs, + .release = arm_smmu_mm_release, + .free_notifier = arm_smmu_mmu_notifier_free, }; /* Allocate or get existing MMU notifier for this {domain, mm} pair */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e95b339e9cdc..8f6d68006ab6 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -219,9 +219,9 @@ static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, } /* Pages have been freed at this point */ -static void intel_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); @@ -256,7 +256,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, - .invalidate_range = intel_invalidate_range, + .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, }; static DEFINE_MUTEX(pasid_mutex); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 4cf4c55a5f00..c06c699c0e7b 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -491,9 +491,9 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) } EXPORT_SYMBOL_GPL(ocxl_link_release); -static void invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); struct ocxl_link *link = pe_data->link; @@ -509,7 +509,7 @@ static void invalidate_range(struct mmu_notifier *mn, } static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { - .invalidate_range = invalidate_range, + .arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs, }; static u64 calculate_cfg_state(bool kernel) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f2e9edc6aa43..6e3c857606f1 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -187,27 +187,27 @@ struct mmu_notifier_ops { const struct mmu_notifier_range *range); /* - * invalidate_range() is either called between - * invalidate_range_start() and invalidate_range_end() when the - * VM has to free pages that where unmapped, but before the - * pages are actually freed, or outside of _start()/_end() when - * a (remote) TLB is necessary. + * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB + * which shares page-tables with the CPU. The + * invalidate_range_start()/end() callbacks should not be implemented as + * invalidate_secondary_tlbs() already catches the points in time when + * an external TLB needs to be flushed. 
* - * If invalidate_range() is used to manage a non-CPU TLB with - * shared page-tables, it not necessary to implement the - * invalidate_range_start()/end() notifiers, as - * invalidate_range() already catches the points in time when an - * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/mm/mmu_notifier.rst + * This requires arch_invalidate_secondary_tlbs() to be called while + * holding the ptl spin-lock and therefore this callback is not allowed + * to sleep. * - * Note that this function might be called with just a sub-range - * of what was passed to invalidate_range_start()/end(), if - * called between those functions. + * This is called by architecture code whenever invalidating a TLB + * entry. It is assumed that any secondary TLB has the same rules for + * when invalidations are required. If this is not the case architecture + * code will need to call this explicitly when required for secondary + * TLB invalidation. */ - void (*invalidate_range)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + void (*arch_invalidate_secondary_tlbs)( + struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -396,8 +396,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); @@ -483,11 +483,11 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) __mmu_notifier_invalidate_range_end(range); } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range(mm, start, end); + __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) @@ -664,7 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ece117de898..e0420de0e2e0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2120,8 +2120,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling - * mmu_notifier_invalidate_range() see comments below inside - * __split_huge_pmd() ? + * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below + * inside __split_huge_pmd() ? 
* * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4672752b0b17..5ef7bccda50c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6649,8 +6649,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, else flush_hugetlb_tlb_range(vma, start, end); /* - * No need to call mmu_notifier_invalidate_range() we are downgrading - * page table protection not changing it to point to a new page. + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are + * downgrading page table protection not changing it to point to a new + * page. * * See Documentation/mm/mmu_notifier.rst */ @@ -7294,7 +7295,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* - * No need to call mmu_notifier_invalidate_range(), see + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. */ mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 453a156d93c0..ec3b068cbbe6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -585,8 +585,8 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) lock_map_release(&__mmu_notifier_invalidate_range_start_map); } -void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; int id; @@ -595,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, hlist_for_each_entry_rcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - if (subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, mm, - start, end); + if (subscription->ops->arch_invalidate_secondary_tlbs) + subscription->ops->arch_invalidate_secondary_tlbs( + subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -616,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Subsystems should only register for invalidate_secondary_tlbs() or + * invalidate_range_start()/end() callbacks, not both. + */ + if (WARN_ON_ONCE(subscription && + (subscription->ops->arch_invalidate_secondary_tlbs && + (subscription->ops->invalidate_range_start || + subscription->ops->invalidate_range_end)))) + return -EINVAL; + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we -- cgit v1.2.3 From eafcb7a972e2039e0f8a8c0d4fd3e90e485c8b9c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 23 Jul 2023 11:31:14 +0800 Subject: mm/mprotect: fix obsolete function name in change_pte_range() Since commit 79a1971c5f14 ("mm: move the copy_one_pte() pte_present check into the caller"), the explanation of preserving soft-dirtiness is moved into copy_nonpresent_pte(). Update corresponding comment. 
Link: https://lkml.kernel.org/r/20230723033114.3224409-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 5c3112d92466..3f36c88a238e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -213,7 +213,7 @@ static long change_pte_range(struct mmu_gather *tlb, } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See - * copy_one_pte() for explanation. + * copy_nonpresent_pte() for explanation. */ entry = make_readable_device_private_entry( swp_offset(entry)); -- cgit v1.2.3 From e6bd14eca207bf822b7c743818ba6e04889348ec Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 21 Jul 2023 23:09:56 +0800 Subject: mm/compaction: correct comment of candidate pfn in fast_isolate_freepages Patch series "Two minor cleanups for compaction", v2. This series contains two random cleanups for compaction. This patch (of 2): If no preferred one was not found, we will use candidate page with maximum pfn > min_pfn which is saved in high_pfn. Correct "minimum" to "maximum candidate" in comment. Link: https://lkml.kernel.org/r/20230721150957.2058634-1-shikemeng@huawei.com Link: https://lkml.kernel.org/r/20230721150957.2058634-2-shikemeng@huawei.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 02239abed6ce..480f29bcf823 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1554,7 +1554,7 @@ static void fast_isolate_freepages(struct compact_control *cc) break; } - /* Use a minimum pfn if a preferred one was not found */ + /* Use a maximum candidate pfn if a preferred one was not found */ if (!page && high_pfn) { page = pfn_to_page(high_pfn); -- cgit v1.2.3 From 3c099a2b0b53d98552cd69d19fd76049bcbafe38 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 21 Jul 2023 23:09:57 +0800 Subject: mm/compaction: avoid unneeded pageblock_end_pfn when no_set_skip_hint is set Move pageblock_end_pfn after no_set_skip_hint check to avoid unneeded pageblock_end_pfn if no_set_skip_hint is set. Link: https://lkml.kernel.org/r/20230721150957.2058634-3-shikemeng@huawei.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 480f29bcf823..e6ac0ef4c178 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -463,12 +463,12 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { struct zone *zone = cc->zone; - pfn = pageblock_end_pfn(pfn); - /* Set for isolation rather than compaction */ if (cc->no_set_skip_hint) return; + pfn = pageblock_end_pfn(pfn); + if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && -- cgit v1.2.3 From 479c33049116f2d138b4dfec328961881cc26b33 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:42 +0800 Subject: mm/page_io: remove unneeded ClearPageUptodate() Patch series "Convert several functions in page_io.c to use a folio", v4. Convert several functions in page_io.c to use a folio, which can remove several implicit calls to compound_head(). 
This patch (of 10): The VM_BUG_ON_FOLIO in swap_readpage() ensures that the page is already !uptodate in __end_swap_bio_read() and sio_read_complete(). Just remove unneeded ClearPageUptodate(). Link: https://lkml.kernel.org/r/20230721034451.16412-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230721034451.16412-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 684cd3c7b59b..4283aeeae0b7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -62,7 +62,6 @@ static void __end_swap_bio_read(struct bio *bio) if (bio->bi_status) { SetPageError(page); - ClearPageUptodate(page); pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); @@ -417,7 +416,6 @@ static void sio_read_complete(struct kiocb *iocb, long ret) struct page *page = sio->bvec[p].bv_page; SetPageError(page); - ClearPageUptodate(page); unlock_page(page); } pr_alert_ratelimited("Read-error on swap-device\n"); -- cgit v1.2.3 From 9962ed64bd2154863ab3b63b15a2b55e39dc7117 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:43 +0800 Subject: mm/page_io: remove unneeded SetPageError() Nobody checks the PageError()/folio_test_error() for the page/folio in __end_swap_bio_read/write() and sio_write_complete(). Therefore, we don't need to set the error flag. Just drop it. Link: https://lkml.kernel.org/r/20230721034451.16412-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 4283aeeae0b7..a34f2cd608f7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -32,7 +32,6 @@ static void __end_swap_bio_write(struct bio *bio) struct page *page = bio_first_page_all(bio); if (bio->bi_status) { - SetPageError(page); /* * We failed to write the page out to swap-space. * Re-dirty the page in order to avoid it being reclaimed. @@ -61,7 +60,6 @@ static void __end_swap_bio_read(struct bio *bio) struct page *page = bio_first_page_all(bio); if (bio->bi_status) { - SetPageError(page); pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); @@ -415,7 +413,6 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct page *page = sio->bvec[p].bv_page; - SetPageError(page); unlock_page(page); } pr_alert_ratelimited("Read-error on swap-device\n"); -- cgit v1.2.3 From a3ed1e9b63a2703caab4fe63ddb560991a5f618c Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:45 +0800 Subject: mm/page_io: use a folio in __end_swap_bio_write() Saves two implicit call to compound_head(). 
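As a minimal sketch (illustrative only, not code from this series; the function names are made up), the saving comes from resolving the head page once instead of inside every page-based helper:

static void example_end_write_page(struct page *page)
{
	set_page_dirty(page);		/* looks up the head page internally */
	end_page_writeback(page);	/* looks up the head page again */
}

static void example_end_write_folio(struct folio *folio)
{
	folio_mark_dirty(folio);	/* the folio already is the head */
	folio_end_writeback(folio);
}

Callers that only have a struct page can do the conversion once with page_folio(), as the later patches in this series do for the sio and bdev paths.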
Link: https://lkml.kernel.org/r/20230721034451.16412-5-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index a34f2cd608f7..f2cc5ebb89b2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -29,7 +29,7 @@ static void __end_swap_bio_write(struct bio *bio) { - struct page *page = bio_first_page_all(bio); + struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { /* @@ -40,13 +40,13 @@ static void __end_swap_bio_write(struct bio *bio) * * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ - set_page_dirty(page); + folio_mark_dirty(folio); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); - ClearPageReclaim(page); + folio_clear_reclaim(folio); } - end_page_writeback(page); + folio_end_writeback(folio); } static void end_swap_bio_write(struct bio *bio) -- cgit v1.2.3 From bc74b53f29e1025a08e97f8e507968608a567f26 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:46 +0800 Subject: mm/page_io: use a folio in __end_swap_bio_read() Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230721034451.16412-6-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index f2cc5ebb89b2..6faaaa9a6c3b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -57,16 +57,16 @@ static void end_swap_bio_write(struct bio *bio) static void __end_swap_bio_read(struct bio *bio) { - struct page *page = bio_first_page_all(bio); + struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } else { - SetPageUptodate(page); + folio_mark_uptodate(folio); } - unlock_page(page); + folio_unlock(folio); } static void end_swap_bio_read(struct bio *bio) -- cgit v1.2.3 From 6a8c068774ad7634b43bebd97182141765398835 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:47 +0800 Subject: mm/page_io: use a folio in sio_read_complete() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-7-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 6faaaa9a6c3b..979d9ce7d9cd 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -403,17 +403,17 @@ static void sio_read_complete(struct kiocb *iocb, long ret) if (ret == sio->len) { for (p = 0; p < sio->pages; p++) { - struct page *page = sio->bvec[p].bv_page; + struct folio *folio = page_folio(sio->bvec[p].bv_page); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } count_vm_events(PSWPIN, sio->pages); } else { for (p = 0; p < sio->pages; p++) { - struct page *page = sio->bvec[p].bv_page; + struct folio *folio = page_folio(sio->bvec[p].bv_page); - unlock_page(page); + folio_unlock(folio); } pr_alert_ratelimited("Read-error on swap-device\n"); } -- cgit v1.2.3 From f54fcaabd34b98921ec12501d0507e1fa1ae831b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:48 +0800 Subject: mm/page_io: use a folio in swap_writepage_bdev_sync() Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230721034451.16412-8-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 979d9ce7d9cd..fc6e1a41744c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -331,6 +331,7 @@ static void swap_writepage_bdev_sync(struct page *page, { struct bio_vec bv; struct bio bio; + struct folio *folio = page_folio(page); bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); @@ -340,8 +341,8 @@ static void swap_writepage_bdev_sync(struct page *page, bio_associate_blkg_from_page(&bio, page); count_swpout_vm_event(page); - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); submit_bio_wait(&bio); __end_swap_bio_write(&bio); -- cgit v1.2.3 From 2675251d5037c308a03f8ad1545b4169522cb950 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:49 +0800 Subject: mm/page_io: use a folio in swap_writepage_bdev_async() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-9-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index fc6e1a41744c..886addfabc70 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -352,6 +352,7 @@ static void swap_writepage_bdev_async(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; + struct folio *folio = page_folio(page); bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), @@ -362,8 +363,8 @@ static void swap_writepage_bdev_async(struct page *page, bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); submit_bio(bio); } -- cgit v1.2.3 From 9b72b134eedc6fbdf7b59c9c4764a57d14b2fea7 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:50 +0800 Subject: mm/page_io: convert count_swpout_vm_event() to take in a folio Convert count_swpout_vm_event() to take in a folio. We can remove five implicit calls to compound_head() by taking in a folio. Link: https://lkml.kernel.org/r/20230721034451.16412-10-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 886addfabc70..8b3845f44331 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -205,13 +205,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) return 0; } -static inline void count_swpout_vm_event(struct page *page) +static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (unlikely(PageTransHuge(page))) + if (unlikely(folio_test_pmd_mappable(folio))) count_vm_event(THP_SWPOUT); #endif - count_vm_events(PSWPOUT, thp_nr_pages(page)); + count_vm_events(PSWPOUT, folio_nr_pages(folio)); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) @@ -280,7 +280,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) } } else { for (p = 0; p < sio->pages; p++) - count_swpout_vm_event(sio->bvec[p].bv_page); + count_swpout_vm_event(page_folio(sio->bvec[p].bv_page)); } for (p = 0; p < sio->pages; p++) @@ -339,7 +339,7 @@ static void swap_writepage_bdev_sync(struct page *page, __bio_add_page(&bio, page, thp_size(page), 0); bio_associate_blkg_from_page(&bio, page); - count_swpout_vm_event(page); + count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); @@ -362,7 +362,7 @@ static void swap_writepage_bdev_async(struct page *page, __bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); - count_swpout_vm_event(page); + count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio(bio); -- cgit v1.2.3 From 98630cfdc4221e1455e13c1bd423d029c888dca6 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:51 +0800 Subject: mm/page_io: convert bio_associate_blkg_from_page() to take in a folio Convert bio_associate_blkg_from_page() to take in a folio. We can remove two implicit calls to compound_head() by taking in a folio. 
Link: https://lkml.kernel.org/r/20230721034451.16412-11-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index 8b3845f44331..ff4156a44d5d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -215,12 +215,12 @@ static inline void count_swpout_vm_event(struct folio *folio) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) { struct cgroup_subsys_state *css; struct mem_cgroup *memcg; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return; @@ -230,7 +230,7 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) rcu_read_unlock(); } #else -#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#define bio_associate_blkg_from_page(bio, folio) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ struct swap_iocb { @@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page, bio.bi_iter.bi_sector = swap_page_sector(page); __bio_add_page(&bio, page, thp_size(page), 0); - bio_associate_blkg_from_page(&bio, page); + bio_associate_blkg_from_page(&bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); @@ -361,7 +361,7 @@ static void swap_writepage_bdev_async(struct page *page, bio->bi_end_io = end_swap_bio_write; __bio_add_page(bio, page, thp_size(page), 0); - bio_associate_blkg_from_page(bio, page); + bio_associate_blkg_from_page(bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); -- cgit v1.2.3 From 90717566f8f6b6761494ccfff43ea62af443fc9b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 20 Jul 2023 21:34:36 +0200 Subject: mm: don't drop VMA locks in mm_drop_all_locks() Despite its name, mm_drop_all_locks() does not drop _all_ locks; the mmap lock is held write-locked by the caller, and the caller is responsible for dropping the mmap lock at a later point (which will also release the VMA locks). Calling vma_end_write_all() here is dangerous because the caller might have write-locked a VMA with the expectation that it will stay write-locked until the mmap_lock is released, as usual. This _almost_ becomes a problem in the following scenario: An anonymous VMA A and an SGX VMA B are mapped adjacent to each other. Userspace calls munmap() on a range starting at the start address of A and ending in the middle of B. Hypothetical call graph with additional notes in brackets: do_vmi_align_munmap [begin first for_each_vma_range loop] vma_start_write [on VMA A] vma_mark_detached [on VMA A] __split_vma [on VMA B] sgx_vma_open [== new->vm_ops->open] sgx_encl_mm_add __mmu_notifier_register [luckily THIS CAN'T ACTUALLY HAPPEN] mm_take_all_locks mm_drop_all_locks vma_end_write_all [drops VMA lock taken on VMA A before] vma_start_write [on VMA B] vma_mark_detached [on VMA B] [end first for_each_vma_range loop] vma_iter_clear_gfp [removes VMAs from maple tree] mmap_write_downgrade unmap_region mmap_read_unlock In this hypothetical scenario, while do_vmi_align_munmap() thinks it still holds a VMA write lock on VMA A, the VMA write lock has actually been invalidated inside __split_vma(). 
The call from sgx_encl_mm_add() to __mmu_notifier_register() can't actually happen here, as far as I understand, because we are duplicating an existing SGX VMA, but sgx_encl_mm_add() only calls __mmu_notifier_register() for the first SGX VMA created in a given process. So this could only happen in fork(), not on munmap(). But in my view it is just pure luck that this can't happen. Also, we wouldn't actually have any bad consequences from this in do_vmi_align_munmap(), because by the time the bug drops the lock on VMA A, we've already marked VMA A as detached, which makes it completely ineligible for any VMA-locked page faults. But again, that's just pure luck. So remove the vma_end_write_all(), so that VMA write locks are only ever released on mmap_write_unlock() or mmap_write_downgrade(). Also add comments to document the locking rules established by this patch. Link: https://lkml.kernel.org/r/20230720193436.454247-1-jannh@google.com Fixes: eeff9a5d47f8 ("mm/mmap: prevent pagefault handler from racing with mmu_notifier registration") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/mmap_lock.h | 8 ++++++++ mm/mmap.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 84988d4ff8fb..93eb291181f7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) return (vma->vm_lock_seq == *mm_lock_seq); } +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + */ static inline void vma_start_write(struct vm_area_struct *vma) { int mm_lock_seq; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a5c63b6d7d46..8d38dcb6d044 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -73,6 +73,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). + * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7bd1caa09ddd..4a9466b76648 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3642,6 +3642,12 @@ int mm_take_all_locks(struct mm_struct *mm) mutex_lock(&mm_all_locks_mutex); + /* + * vma_start_write() does not have a complement in mm_drop_all_locks() + * because vma_start_write() is always asymmetrical; it marks a VMA as + * being written to until mmap_write_unlock() or mmap_write_downgrade() + * is reached. + */ mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; @@ -3738,7 +3744,6 @@ void mm_drop_all_locks(struct mm_struct *mm) if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } - vma_end_write_all(mm); mutex_unlock(&mm_all_locks_mutex); } -- cgit v1.2.3 From fd892593d44d8b649caf30a67f0c7696d976d901 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 24 Jul 2023 14:31:45 -0400 Subject: mm: change do_vmi_align_munmap() tracking of VMAs to remove The majority of the calls to munmap a vm range is within a single vma. The maple tree is able to store a single entry at 0, with a size of 1 as a pointer and avoid any allocations. Change do_vmi_align_munmap() to store the VMAs being munmap()'ed into a tree indexed by the count. This will leverage the ability to store the first entry without a node allocation. Storing the entries into a tree by the count and not the vma start and end means changing the functions which iterate over the entries. Update unmap_vmas() and free_pgtables() to take a maple state and a tree end address to support this functionality. Passing through the same maple state to unmap_vmas() and free_pgtables() means the state needs to be reset between calls. This happens in the static unmap_region() and exit_mmap(). Link: https://lkml.kernel.org/r/20230724183157.3939892-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- mm/internal.h | 2 +- mm/memory.c | 16 +++++++--------- mm/mmap.c | 41 ++++++++++++++++++++++++----------------- 4 files changed, 34 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 93eb291181f7..ded514ee2588 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2287,9 +2287,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } -void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; diff --git a/mm/internal.h b/mm/internal.h index 483add0bfb28..7d11ebe5d11c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,7 +109,7 @@ bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, +void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); diff --git a/mm/memory.c b/mm/memory.c index 3e16f0637376..ed4807deec89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -361,12 +361,10 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, +void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - do { unsigned long addr = vma->vm_start; struct vm_area_struct *next; @@ -375,7 +373,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. 
*/ - next = mas_find(&mas, ceiling - 1); + next = mas_find(mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -396,7 +394,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = mas_find(&mas, ceiling - 1); + next = mas_find(mas, ceiling - 1); if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1713,9 +1711,10 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end, + bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1723,7 +1722,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, /* Careful - we need to zap private pages too! */ .even_cows = true, }; - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); @@ -1731,7 +1729,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); + } while ((vma = mas_find(mas, tree_end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index 4a9466b76648..5212a0b66b8f 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -76,10 +76,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, +static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -2293,18 +2293,20 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) * * Called with the mm semaphore held. */ -static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, +static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, - unsigned long start, unsigned long end, bool mm_wr_locked) + struct vm_area_struct *next, unsigned long start, + unsigned long end, unsigned long tree_end, bool mm_wr_locked) { struct mmu_gather tlb; + unsigned long mt_start = mas->index; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); - free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); + mas_set(mas, mt_start); + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? 
next->vm_start : USER_PGTABLES_CEILING, mm_wr_locked); tlb_finish_mmu(&tlb); @@ -2472,7 +2474,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, goto end_split_failed; } vma_start_write(next); - mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + mas_set(&mas_detach, count); error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; @@ -2511,17 +2513,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { - MA_STATE(test, &mt_detach, start, end - 1); + MA_STATE(test, &mt_detach, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vmi, start); rcu_read_lock(); - vma_test = mas_find(&test, end - 1); + vma_test = mas_find(&test, count - 1); for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; - vma_test = mas_next(&test, end - 1); + vma_test = mas_next(&test, count - 1); } rcu_read_unlock(); BUG_ON(count != test_count); @@ -2542,9 +2544,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ - unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock); + mas_set(&mas_detach, 1); + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, + !unlock); /* Statistics and freeing VMAs */ - mas_set(&mas_detach, start); + mas_set(&mas_detach, 0); remove_mt(mm, &mas_detach); validate_mm(mm); if (unlock) @@ -2864,9 +2868,10 @@ unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; + vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, - vma->vm_end, true); + unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, + vma->vm_end, vma->vm_end, true); } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); @@ -3185,7 +3190,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); + unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* @@ -3195,7 +3200,8 @@ void exit_mmap(struct mm_struct *mm) set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); - free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + mas_set(&mas, vma->vm_end); + free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); @@ -3204,6 +3210,7 @@ void exit_mmap(struct mm_struct *mm) * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ + mas_set(&mas, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); -- cgit v1.2.3 From 445a2ea0ef0e0e69812218b2c896a23443466625 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:46 -0400 Subject: mm: remove prev check from do_vmi_align_munmap() If the prev does not exist, the vma iterator will be set to MAS_NONE, which will be treated as a MAS_START when the mas_next or mas_find is used. In this case, the next caller will be the vma iterator, which uses mas_find() under the hood and will now do what the user expects. 
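A minimal sketch of the call pattern this relies on, based on the hunk below (illustrative only, not part of the patch):

	prev = vma_prev(vmi);	/* may return NULL, leaving the iterator in MAS_NONE */
	/* removed by this patch:
	 *	if (unlikely(!prev))
	 *		vma_iter_set(vmi, start);
	 */
	for_each_vma_range(*vmi, next, end) {
		/* uses mas_find() under the hood, which treats MAS_NONE
		 * like MAS_START, so the walk still begins where expected */
	}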
Link: https://lkml.kernel.org/r/20230724183157.3939892-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5212a0b66b8f..5fbc7d71d60c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2459,8 +2459,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } prev = vma_prev(vmi); - if (unlikely((!prev))) - vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as -- cgit v1.2.3 From 53bee98d004fcd7062f2fe056720704e947e6000 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:48 -0400 Subject: mm: remove re-walk from mmap_region() Using vma_iter_set() will reset the tree and cause a re-walk. Use vmi_iter_config() to set the write to a sub-set of the range. Change the file case to also use vmi_iter_config() so that the end is correctly set. Link: https://lkml.kernel.org/r/20230724183157.3939892-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 8 ++++++++ mm/mmap.c | 15 ++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 7d11ebe5d11c..c5ba08f55deb 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1041,6 +1041,14 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +static inline void vma_iter_config(struct vma_iterator *vmi, + unsigned long index, unsigned long last) +{ + MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START && + (vmi->mas.index > index || vmi->mas.last < index)); + __mas_set_range(&vmi->mas, index, last - 1); +} + /* * VMA Iterator functions shared between nommu and mmap */ diff --git a/mm/mmap.c b/mm/mmap.c index 5fbc7d71d60c..a1a59487390e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2676,8 +2676,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, next = vma_next(&vmi); prev = vma_prev(&vmi); - if (vm_flags & VM_SPECIAL) + if (vm_flags & VM_SPECIAL) { + if (prev) + vma_iter_next_range(&vmi); goto cannot_expand; + } /* Attempt to expand an old mapping */ /* Check next */ @@ -2698,6 +2701,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, merge_start = prev->vm_start; vma = prev; vm_pgoff = prev->vm_pgoff; + } else if (prev) { + vma_iter_next_range(&vmi); } @@ -2708,9 +2713,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto expanded; } + if (vma == prev) + vma_iter_set(&vmi, addr); cannot_expand: - if (prev) - vma_iter_next_range(&vmi); /* * Determine the object being mapped and call the appropriate @@ -2723,7 +2728,7 @@ cannot_expand: goto unacct_error; } - vma_iter_set(&vmi, addr); + vma_iter_config(&vmi, addr, end); vma->vm_start = addr; vma->vm_end = end; vm_flags_init(vma, vm_flags); @@ -2750,7 +2755,7 @@ cannot_expand: if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - vma_iter_set(&vmi, addr); + vma_iter_config(&vmi, addr, end); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. -- cgit v1.2.3 From da0892547b101df6e13255b378380d077975368d Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 24 Jul 2023 14:31:49 -0400 Subject: maple_tree: re-introduce entry to mas_preallocate() arguments The current preallocation strategy is to preallocate the absolute worst-case allocation for a tree modification. The entry (or NULL) is needed to know how many nodes are needed to write to the tree. Start by adding the argument to the mas_preallocate() definition. Link: https://lkml.kernel.org/r/20230724183157.3939892-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 3 ++- mm/internal.h | 2 +- mm/mmap.c | 4 ++-- tools/testing/radix-tree/maple.c | 32 ++++++++++++++++---------------- 5 files changed, 22 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e10db656e31c..c962af188681 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -466,7 +466,7 @@ void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); -int mas_preallocate(struct ma_state *mas, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3b6f8c8dac65..0d7e30c7d999 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5529,11 +5529,12 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state + * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { int ret; diff --git a/mm/internal.h b/mm/internal.h index c5ba08f55deb..65f646c2ccf3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1054,7 +1054,7 @@ static inline void vma_iter_config(struct vma_iterator *vmi, */ static inline int vma_iter_prealloc(struct vma_iterator *vmi) { - return mas_preallocate(&vmi->mas, GFP_KERNEL); + return mas_preallocate(&vmi->mas, NULL, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi, diff --git a/mm/mmap.c b/mm/mmap.c index a1a59487390e..05e051f87273 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1960,7 +1960,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2050,7 +2050,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. 
*/ diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 9901ae821911..c6c1c5109deb 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35458,7 +35458,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35467,18 +35467,18 @@ static noinline void __init check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35487,26 +35487,26 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35515,12 +35515,12 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35528,21 +35528,21 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = 
mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35550,14 +35550,14 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35565,7 +35565,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); -- cgit v1.2.3 From f72cf24a868675280bc555e95abc3639777f8d82 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:51 -0400 Subject: mm: use vma_iter_clear_gfp() in nommu Move the definition of vma_iter_clear_gfp() from mmap.c to internal.h so it can be used in the nommu code. This will reduce node preallocations in nommu. Link: https://lkml.kernel.org/r/20230724183157.3939892-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 12 ++++++++++++ mm/mmap.c | 12 ------------ mm/nommu.c | 12 ++++-------- 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 65f646c2ccf3..2f35c0ef7b7f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1064,6 +1064,18 @@ static inline void vma_iter_clear(struct vma_iterator *vmi, mas_store_prealloc(&vmi->mas, NULL); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); diff --git a/mm/mmap.c b/mm/mmap.c index 05e051f87273..1ddb024995df 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -154,18 +154,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, return mas_prev(&vmi->mas, min); } -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - vmi->mas.index = start; - vmi->mas.last = end - 1; - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. diff --git a/mm/nommu.c b/mm/nommu.c index 9826f6101a05..418cc0669c1f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1396,17 +1396,13 @@ static int vmi_shrink_vma(struct vma_iterator *vmi, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (vma_iter_prealloc(vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - if (from > vma->vm_start) { - vma_iter_clear(vmi, from, vma->vm_end); + if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL)) + return -ENOMEM; vma->vm_end = from; } else { - vma_iter_clear(vmi, vma->vm_start, to); + if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL)) + return -ENOMEM; vma->vm_start = to; } -- cgit v1.2.3 From b5df09226450165c434084d346fcb6d4858b0d52 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:52 -0400 Subject: mm: set up vma iterator for vma_iter_prealloc() calls Set the correct limits for vma_iter_prealloc() calls so that the maple tree can be smarter about how many nodes are needed. Link: https://lkml.kernel.org/r/20230724183157.3939892-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- fs/exec.c | 1 + mm/internal.h | 18 ++++++---------- mm/mmap.c | 69 ++++++++++++++++++++++++++++++++++++----------------------- mm/nommu.c | 33 ++++++++++++---------------- 4 files changed, 64 insertions(+), 57 deletions(-) (limited to 'mm') diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba94..0b9484358a49 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) if (vma != vma_next(&vmi)) return -EFAULT; + vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ diff --git a/mm/internal.h b/mm/internal.h index 2f35c0ef7b7f..5a03bc4782a2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1052,23 +1052,21 @@ static inline void vma_iter_config(struct vma_iterator *vmi, /* * VMA Iterator functions shared between nommu and mmap */ -static inline int vma_iter_prealloc(struct vma_iterator *vmi) +static inline int vma_iter_prealloc(struct vma_iterator *vmi, + struct vm_area_struct *vma) { - return mas_preallocate(&vmi->mas, NULL, GFP_KERNEL); + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } -static inline void vma_iter_clear(struct vma_iterator *vmi, - unsigned long start, unsigned long end) +static inline void vma_iter_clear(struct vma_iterator *vmi) { - mas_set_range(&vmi->mas, start, end - 1); mas_store_prealloc(&vmi->mas, NULL); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { - vmi->mas.index = start; - vmi->mas.last = end - 1; + __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; @@ -1105,8 +1103,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); - vmi->mas.index = vma->vm_start; - vmi->mas.last = vma->vm_end - 1; + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } @@ -1117,8 +1114,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); - vmi->mas.index = vma->vm_start; - vmi->mas.last = vma->vm_end - 1; + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; diff --git a/mm/mmap.c b/mm/mmap.c index 1ddb024995df..3f10e708ba72 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -397,7 +397,8 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; - if (vma_iter_prealloc(&vmi)) + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_iter_store(&vmi, vma); @@ -649,19 +650,16 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); - if (vma_iter_prealloc(vmi)) + /* Note: vma iterator must be pointing to 'start' */ + vma_iter_config(vmi, start, end); + if (vma_iter_prealloc(vmi, vma)) goto nomem; vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - /* VMA iterator points to previous, so set to start if necessary */ - if (vma_iter_addr(vmi) != start) - vma_iter_set(vmi, start); - vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - /* Note: mas must be pointing to 
the expanding VMA */ vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -687,19 +685,19 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); - if (vma_iter_prealloc(vmi)) + if (vma->vm_start < start) + vma_iter_config(vmi, vma->vm_start, start); + else + vma_iter_config(vmi, end, vma->vm_end); + + if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - if (vma->vm_start < start) - vma_iter_clear(vmi, vma->vm_start, start); - - if (vma->vm_end > end) - vma_iter_clear(vmi, end, vma->vm_end); - + vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -973,7 +971,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (err) return NULL; - if (vma_iter_prealloc(vmi)) + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + if (vma_expanded) { + vma_iter_config(vmi, vma_start, vma_end); + } else { + vma_iter_config(vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } + + if (vma_iter_prealloc(vmi, vma)) return NULL; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); @@ -982,8 +990,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; vma->vm_start = vma_start; vma->vm_end = vma_end; @@ -1923,7 +1929,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, vma->vm_start, address); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -1948,6 +1954,10 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } + if (next) + mas_prev_range(&mas, address); + + __mas_set_range(&mas, vma->vm_start, address - 1); if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -1993,7 +2003,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ - mas_set_range(&mas, vma->vm_start, address - 1); mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2038,6 +2047,10 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } + if (prev) + mas_next_range(&mas, vma->vm_start); + + __mas_set_range(&mas, address, vma->vm_end - 1); if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -2084,7 +2097,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. 
*/ - mas_set_range(&mas, address, vma->vm_end - 1); mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2325,10 +2337,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) return -ENOMEM; - err = -ENOMEM; - if (vma_iter_prealloc(vmi)) - goto out_free_vma; - if (new_below) { new->vm_end = addr; } else { @@ -2336,6 +2344,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } + err = -ENOMEM; + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, new)) + goto out_free_vma; + err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; @@ -2693,7 +2706,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_next_range(&vmi); } - /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { @@ -2790,7 +2802,7 @@ cannot_expand: goto close_and_free_vma; error = -ENOMEM; - if (vma_iter_prealloc(&vmi)) + if (vma_iter_prealloc(&vmi, vma)) goto close_and_free_vma; /* Lock the VMA since it is modified after insertion into VMA tree */ @@ -3053,7 +3065,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma && vma->vm_end == addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - if (vma_iter_prealloc(vmi)) + vma_iter_config(vmi, vma->vm_start, addr + len); + if (vma_iter_prealloc(vmi, vma)) goto unacct_fail; init_vma_prep(&vp, vma); @@ -3068,6 +3081,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; } + if (vma) + vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) diff --git a/mm/nommu.c b/mm/nommu.c index 418cc0669c1f..1fe0ee239860 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -583,7 +583,8 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (vma_iter_prealloc(&vmi)) { + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -591,7 +592,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); + vma_iter_clear(&vmi); return 0; } /* @@ -1054,9 +1055,6 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (vma_iter_prealloc(&vmi)) - goto error_vma_iter_prealloc; - region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1198,6 +1196,10 @@ unsigned long do_mmap(struct file *file, share: BUG_ON(!vma->vm_region); + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + goto error_just_free; + setup_vma_to_mm(vma, current->mm); current->mm->map_count++; /* add the VMA to the tree */ @@ -1244,14 +1246,6 @@ error_getting_region: len, current->pid); show_mem(); return -ENOMEM; - -error_vma_iter_prealloc: - kmem_cache_free(vm_region_jar, region); - vm_area_free(vma); - pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - show_mem(); - return -ENOMEM; - } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1336,12 +1330,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) goto 
err_vma_dup; - if (vma_iter_prealloc(vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - goto err_vmi_preallocate; - } - /* most fields are the same, copy all, and then fixup */ *region = *vma->vm_region; new->vm_region = region; @@ -1355,6 +1343,13 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, region->vm_pgoff = new->vm_pgoff += npages; } + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, vma)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_vmi_preallocate; + } + if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); -- cgit v1.2.3 From 6935e052557caaa8e1ee0a7d85faeb55853d2e0e Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:57 -0400 Subject: mm/mmap: change vma iteration order in do_vmi_align_munmap() By delaying the setting of prev/next VMA until after the write of NULL, the probability of the prev/next VMA already being in the CPU cache is significantly increased, especially for larger munmap operations. It also means that prev/next will be loaded closer to when they are used. This requires changing the loop type when gathering the VMAs that will be freed. Since prev will be set later in the function, it is better to reverse the splitting direction of the start VMA (modify the new_below argument to __split_vma). Using the vma_iter_prev_range() to walk back to the correct location in the tree will, on the most part, mean walking within the CPU cache. Usually, this is two steps vs a node reset and a tree re-walk. Link: https://lkml.kernel.org/r/20230724183157.3939892-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 3f10e708ba72..bc91d91261ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2452,20 +2452,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = __split_vma(vmi, vma, start, 0); + error = __split_vma(vmi, vma, start, 1); if (error) goto start_split_failed; - - vma = vma_iter_load(vmi); } - prev = vma_prev(vmi); - /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - for_each_vma_range(*vmi, next, end) { + next = vma; + do { /* Does it split the end? */ if (next->vm_end > end) { error = __split_vma(vmi, next, end, 0); @@ -2501,13 +2498,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end); #endif - } - - if (vma_iter_end(vmi) > end) - next = vma_iter_load(vmi); - - if (!next) - next = vma_next(vmi); + } for_each_vma_range(*vmi, next, end); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. 
*/ @@ -2528,7 +2519,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(count != test_count); } #endif - vma_iter_set(vmi, start); + + while (vma_iter_addr(vmi) > start) + vma_iter_prev_range(vmi); + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; @@ -2539,6 +2533,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (unlock) mmap_write_downgrade(mm); + prev = vma_iter_prev_range(vmi); + next = vma_next(vmi); + if (next) + vma_iter_prev_range(vmi); + /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. -- cgit v1.2.3 From 350f6bbca1de515cd7519a33661cefc93ea06054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:02 +0100 Subject: mm: allow per-VMA locks on file-backed VMAs Remove the TCP layering violation by allowing per-VMA locks on all VMAs. The fault path will immediately fail in handle_mm_fault(). There may be a small performance reduction from this patch as a little unnecessary work will be done on each page fault. See later patches for the improvement. Link: https://lkml.kernel.org/r/20230724185410.1124082-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - include/linux/net_mm.h | 17 ----------------- include/net/tcp.h | 1 - mm/memory.c | 10 +++++----- net/ipv4/tcp.c | 11 ++++------- 5 files changed, 9 insertions(+), 31 deletions(-) delete mode 100644 include/linux/net_mm.h (limited to 'mm') diff --git a/MAINTAINERS b/MAINTAINERS index 53b7ca804465..9e4cfcd7998a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14829,7 +14829,6 @@ NETWORKING [TCP] M: Eric Dumazet L: netdev@vger.kernel.org S: Maintained -F: include/linux/net_mm.h F: include/linux/tcp.h F: include/net/tcp.h F: include/trace/events/tcp.h diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h deleted file mode 100644 index b298998bd5a0..000000000000 --- a/include/linux/net_mm.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifdef CONFIG_MMU - -#ifdef CONFIG_INET -extern const struct vm_operations_struct tcp_vm_ops; -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return vma->vm_ops == &tcp_vm_ops; -} -#else -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return false; -} -#endif /* CONFIG_INET*/ - -#endif /* CONFIG_MMU */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ca972ebd3dd..3a818fe1a8a5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,7 +45,6 @@ #include #include #include -#include extern struct inet_hashinfo tcp_hashinfo; diff --git a/mm/memory.c b/mm/memory.c index ed4807deec89..b2b17c66f87a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,7 +77,6 @@ #include #include #include -#include #include @@ -5223,6 +5222,11 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. 
@@ -5394,10 +5398,6 @@ retry: if (!vma) goto inval; - /* Only anonymous and tcp vmas are supported for now */ - if (!vma_is_anonymous(vma) && !vma_is_tcp(vma)) - goto inval; - if (!vma_start_read(vma)) goto inval; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ed52e1e3c99..b9d49803e77f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1739,7 +1739,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb, } #ifdef CONFIG_MMU -const struct vm_operations_struct tcp_vm_ops = { +static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, @@ -2042,13 +2042,10 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); -#ifdef CONFIG_PER_VMA_LOCK - vma = lock_vma_under_rcu(mm, address); -#endif if (vma) { - if (!vma_is_tcp(vma)) { + if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } @@ -2058,7 +2055,7 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = vma_lookup(mm, address); - if (!vma || !vma_is_tcp(vma)) { + if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } -- cgit v1.2.3 From 4ec31152a80d83d74d231d964703a721236244ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:03 +0100 Subject: mm: move FAULT_FLAG_VMA_LOCK check from handle_mm_fault() Handle a little more of the page fault path outside the mmap sem. The hugetlb path doesn't need to check whether the VMA is anonymous; the VM_HUGETLB flag is only set on hugetlbfs VMAs. There should be no performance change from the previous commit; this is simply a step to ease bisection of any problems. Link: https://lkml.kernel.org/r/20230724185410.1124082-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 6 ++++++ mm/memory.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5ef7bccda50c..26e87d6cc92f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6062,6 +6062,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); + /* TODO: Handle faults under the VMA lock */ + if (flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate diff --git a/mm/memory.c b/mm/memory.c index b2b17c66f87a..2a5f4883d9a5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4984,10 +4984,10 @@ unlock: } /* - * By the time we get here, we already hold the mm semaphore - * - * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __folio_lock_or_retry(). + * On entry, we hold either the VMA lock or the mmap_lock + * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in + * the result, the mmap_lock is not held on exit. See filemap_fault() + * and __folio_lock_or_retry(). 
*/ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -5006,6 +5006,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, p4d_t *p4d; vm_fault_t ret; + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5222,11 +5227,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. -- cgit v1.2.3 From c4fd825e188471d4d2796e02729dd029b3b23210 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:04 +0100 Subject: mm: handle PUD faults under the VMA lock Postpone checking the VMA_LOCK flag until we've attempted to handle faults on PUDs. There's a mild upside to this patch in that we'll allocate the page tables while under the VMA lock rather than the mmap lock, reducing the hold time on the mmap lock, since the retry will find the page tables already populated. The real purpose here is to make a commit that shows we don't call ->huge_fault under the VMA lock. We do now handle setting the accessed bit on a PUD fault under the VMA lock, but that doesn't seem likely to be a measurable difference. Link: https://lkml.kernel.org/r/20230724185410.1124082-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2a5f4883d9a5..29353d552a3f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4859,11 +4859,17 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4872,21 +4878,26 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vma)) goto split; - if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - if (vmf->vma->vm_ops->huge_fault) { - ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not handled on PUD level: split 
pud.*/ - __split_huge_pud(vmf->vma, vmf->pud, vmf->address); + __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } @@ -5006,11 +5017,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, p4d_t *p4d; vm_fault_t ret; - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5054,6 +5060,11 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (pmd_none(*vmf.pmd) && hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); -- cgit v1.2.3 From 8f5fd0e1a02020062c52063f15d4e5c426ee3547 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:05 +0100 Subject: mm: handle some PMD faults under the VMA lock Push the VMA_LOCK check down from __handle_mm_fault() to handle_pte_fault(). Once again, we refuse to call ->huge_fault() with the VMA lock held, but we will wait for a PMD migration entry with the VMA lock held, handle NUMA migration and set the accessed bit. We were already doing this for anonymous VMAs, so it should be safe. Link: https://lkml.kernel.org/r/20230724185410.1124082-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 29353d552a3f..932fc6286536 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4821,36 +4821,47 @@ out_map: static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { - if (vma_is_anonymous(vmf->vma)) + struct vm_area_struct *vma = vmf->vma; + if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + } return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; - if (vma_is_anonymous(vmf->vma)) { + if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } - if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - if (vmf->vma->vm_ops->huge_fault) { - ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } /* COW or write-notify handled on pte level: split pmd. 
*/ - __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -4921,6 +4932,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may @@ -5060,11 +5076,6 @@ retry_pud: if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - if (pmd_none(*vmf.pmd) && hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); -- cgit v1.2.3 From 0c2e394ab23017303f676e6206a54c54bb0e3681 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:06 +0100 Subject: mm: move FAULT_FLAG_VMA_LOCK check down in handle_pte_fault() Call do_pte_missing() under the VMA lock ... then immediately retry in do_fault(). Link: https://lkml.kernel.org/r/20230724185410.1124082-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 932fc6286536..d947d8d9e891 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4647,6 +4647,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; + if (vmf->flags & FAULT_FLAG_VMA_LOCK){ + vma_end_read(vma); + return VM_FAULT_RETRY; + } + /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -4932,11 +4937,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may @@ -4969,6 +4969,12 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) return do_pte_missing(vmf); + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + pte_unmap(vmf->pte); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); -- cgit v1.2.3 From 61a4b8d32025dcabcd78994f887a4b9dff912cf0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:07 +0100 Subject: mm: move FAULT_FLAG_VMA_LOCK check down from do_fault() Perform the check at the start of do_read_fault(), do_cow_fault() and do_shared_fault() instead. Should be no performance change from the last commit. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d947d8d9e891..23a20b7a483c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4533,6 +4533,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) vm_fault_t ret = 0; struct folio *folio; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or @@ -4561,6 +4566,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -4601,6 +4611,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) vm_fault_t ret, tmp; struct folio *folio; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -4647,11 +4662,6 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK){ - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ -- cgit v1.2.3 From f5617ffeb450f84c57f7eba1a3524a29955d42b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:08 +0100 Subject: mm: run the fault-around code under the VMA lock The map_pages fs method should be safe to run under the VMA lock instead of the mmap lock. This should have a measurable reduction in contention on the mmap lock. Link: https://lkml.kernel.org/r/20230724185410.1124082-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 23a20b7a483c..52235aa3d665 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4533,11 +4533,6 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) vm_fault_t ret = 0; struct folio *folio; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or @@ -4549,6 +4544,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; -- cgit v1.2.3 From 4c2f803abb1797e571579adcaf134a727b3ffc48 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:09 +0100 Subject: mm: handle swap and NUMA PTE faults under the VMA lock Move the FAULT_FLAG_VMA_LOCK check down in handle_pte_fault(). This is probably not a huge win in its own right, but is a nicely separable bit from the next patch. 
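After this change the top of handle_pte_fault() resolves missing, swap and NUMA faults before the per-VMA-lock bail-out; a sketch of the resulting order, reconstructed from the hunk below:

	if (!vmf->pte)
		return do_pte_missing(vmf);

	if (!pte_present(vmf->orig_pte))
		return do_swap_page(vmf);

	if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
		return do_numa_page(vmf);

	/* non-anonymous VMAs still retry under the mmap_lock from here on */
	if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) {
		pte_unmap(vmf->pte);
		vma_end_read(vmf->vma);
		return VM_FAULT_RETRY;
	}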
Link: https://lkml.kernel.org/r/20230724185410.1124082-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 52235aa3d665..c122adce47b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4979,18 +4979,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) return do_pte_missing(vmf); - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - pte_unmap(vmf->pte); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + pte_unmap(vmf->pte); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { -- cgit v1.2.3 From 063e60d806151f3733acabccb62a463d55fac469 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:10 +0100 Subject: mm: handle faults that merely update the accessed bit under the VMA lock Move FAULT_FLAG_VMA_LOCK check out of handle_pte_fault(). This should have a significant performance improvement for mmaped files. Write faults (on read-only shared pages) still take the mmap lock as we do not want to audit all the implementations of ->pfn_mkwrite() and ->page_mkwrite(). However write-faults on private mappings are handled under the VMA lock. [willy@infradead.org: address "suspicious RCU usage" warning] Link: https://lkml.kernel.org/r/ZMK7jwpI4uD6tKrF@casper.infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c122adce47b4..f06266464208 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3268,6 +3268,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) @@ -3290,6 +3295,12 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + folio_put(folio); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { @@ -3431,6 +3442,12 @@ reuse: return 0; } copy: + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + /* * Ok, we need to copy. Oh, well.. 
*/ @@ -4985,12 +5002,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - pte_unmap(vmf->pte); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { -- cgit v1.2.3 From 348ad1606f4c09e3dc28092baac474e10a252471 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:47 +0530 Subject: mm/hugepage pud: allow arch-specific helper function to check huge page pud support Patch series "Add support for DAX vmemmap optimization for ppc64", v6. This patch series implements changes required to support DAX vmemmap optimization for ppc64. The vmemmap optimization is only enabled with radix MMU translation and 1GB PUD mapping with 64K page size. The patch series also splits the hugetlb vmemmap optimization as a separate Kconfig variable so that architectures can enable DAX vmemmap optimization without enabling hugetlb vmemmap optimization. This should enable architectures like arm64 to enable DAX vmemmap optimization while they can't enable hugetlb vmemmap optimization. More details of the same are in patch "mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization". With 64K page size for 16384 pages added (1G) we save 14 pages With 4K page size for 262144 pages added (1G) we save 4094 pages With 4K page size for 512 pages added (2M) we save 6 pages This patch (of 13): Architectures like powerpc would like to enable transparent huge page pud support only with radix translation. To support that add has_transparent_pud_hugepage() helper that architectures can override. 
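The new helper is wired up with the usual guarded-macro pattern: the generic fallback in include/linux/pgtable.h (see the hunk below) is compiled only when an architecture has not supplied its own definition. As a rough sketch of what an architecture-side override could look like, with powerpc's radix_enabled() predicate standing in for the arch-specific condition; the exact form used by the powerpc patches later in this series may differ.

	/*
	 * Illustrative sketch of an arch override, not taken verbatim from the
	 * series.  Defining the bare name suppresses the generic fallback,
	 * which is guarded by #ifndef has_transparent_pud_hugepage.
	 */
	#define has_transparent_pud_hugepage has_transparent_pud_hugepage
	static inline bool has_transparent_pud_hugepage(void)
	{
		/* report PUD-level THP support only under the radix MMU */
		return radix_enabled();
	}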
[aneesh.kumar@linux.ibm.com: use the new has_transparent_pud_hugepage()] Link: https://lkml.kernel.org/r/87tttrvtaj.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/nvdimm/pfn_devs.c | 2 +- include/linux/pgtable.h | 3 +++ mm/debug_vm_pgtable.c | 16 +++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index af7d9301520c..18ad315581ca 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) if (has_transparent_hugepage()) { alignments[1] = HPAGE_PMD_SIZE; - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + if (has_transparent_pud_hugepage()) alignments[2] = HPAGE_PUD_SIZE; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f36c055794b..5eb6bdf30c62 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1505,6 +1505,9 @@ typedef unsigned int pgtbl_mod_mask; #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif +#ifndef has_transparent_pud_hugepage +#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ee119e33fef1..844fdfd687b9 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -302,7 +302,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) unsigned long val = idx, *ptr = &val; pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD basic (%pGv)\n", ptr); @@ -343,7 +343,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) unsigned long vaddr = args->vaddr; pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; page = (args->pud_pfn != ULONG_MAX) ? 
pfn_to_page(args->pud_pfn) : NULL; @@ -405,7 +405,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD leaf\n"); @@ -732,7 +732,7 @@ static void __init pud_devmap_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD devmap\n"); @@ -981,7 +981,7 @@ static void __init pud_thp_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD based THP\n"); @@ -1022,8 +1022,7 @@ static void __init destroy_args(struct pgtable_debug_args *args) /* Free (huge) page */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - has_transparent_hugepage() && + has_transparent_pud_hugepage() && args->pud_pfn != ULONG_MAX) { if (args->is_contiguous_page) { free_contig_range(args->pud_pfn, @@ -1274,8 +1273,7 @@ static int __init init_args(struct pgtable_debug_args *args) * if we fail to allocate (huge) pages. */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - has_transparent_hugepage()) { + has_transparent_pud_hugepage()) { page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_SHIFT - PAGE_SHIFT); if (page) { -- cgit v1.2.3 From f32928ab6fe5abac5a270b6c0bffc4ce77ee8c42 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:48 +0530 Subject: mm: change pudp_huge_get_and_clear_full take vm_area_struct as arg We will use this in a later patch to do tlb flush when clearing pud entries on powerpc. This is similar to commit 93a98695f2f9 ("mm: change pmdp_huge_get_and_clear_full take vm_area_struct as arg") Link: https://lkml.kernel.org/r/20230724190759.483013-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- mm/debug_vm_pgtable.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5eb6bdf30c62..124427ece520 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -456,11 +456,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL -static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { - return pudp_huge_get_and_clear(mm, address, pudp); + return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 844fdfd687b9..d61eaa075c75 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1); pud = READ_ONCE(*args->pudp); 
WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e0420de0e2e0..e371503f7746 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1981,7 +1981,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); + pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); -- cgit v1.2.3 From c1a6c536fb088c01d6bdce77731d89ad5e1734c6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:49 +0530 Subject: mm/vmemmap: improve vmemmap_can_optimize and allow architectures to override dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within vmemmap such that tail page mapping can point to the second PAGE_SIZE area. Enforce that in vmemmap_can_optimize() function. Architectures like powerpc also want to enable vmemmap optimization conditionally (only with radix MMU translation). Hence allow architecture override. Link: https://lkml.kernel.org/r/20230724190759.483013-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +++++++++++++++++++++++---- mm/mm_init.c | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 21299a0cfbca..d4ce73c20dcc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3632,13 +3632,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP -static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - return is_power_of_2(sizeof(struct page)) && - pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* + * For vmemmap optimization with DAX we need minimum 2 vmemmap + * pages. 
See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) diff --git a/mm/mm_init.c b/mm/mm_init.c index acb0ac194672..641c56fd08a2 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, if (!vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); - return 2 * (PAGE_SIZE / sizeof(struct page)); + return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); } static void __ref memmap_init_compound(struct page *head, -- cgit v1.2.3 From 40135fc7188cee4ae64777148fd462f4ea523181 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:50 +0530 Subject: mm/vmemmap: allow architectures to override how vmemmap optimization works Architectures like powerpc will like to use different page table allocators and mapping mechanisms to implement vmemmap optimization. Similar to vmemmap_populate allow architectures to implement vmemap_populate_compound_pages Link: https://lkml.kernel.org/r/20230724190759.483013-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a044a130405b..a2cbe44c48e1 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, return 0; } +#ifndef vmemmap_populate_compound_pages /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail @@ -446,6 +447,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, return 0; } +#endif + struct page * __meminit __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) -- cgit v1.2.3 From 54a948a1e97a9d19a0a4bed63a2d4caef30c5d17 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:52 +0530 Subject: mm/huge pud: use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE pudp_set_wrprotect and move_huge_pud helpers are only used when CONFIG_TRANSPARENT_HUGEPAGE is enabled. 
Similar to pmdp_set_wrprotect and move_huge_pmd_helpers use architecture override only if CONFIG_TRANSPARENT_HUGEPAGE is set Link: https://lkml.kernel.org/r/20230724190759.483013-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 ++ mm/mremap.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 0af8bc4ce258..f34e0f2cb4d8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -564,6 +564,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { @@ -577,6 +578,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm, { BUILD_BUG(); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif diff --git a/mm/mremap.c b/mm/mremap.c index 11e06e4ab33b..056478c106ee 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, } #endif -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { -- cgit v1.2.3 From 0b6f15824cc7e431a9706c78bfb9cb3011477ad3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:53 +0530 Subject: mm/vmemmap optimization: split hugetlb and devdax vmemmap optimization Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. 
With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Link: https://lkml.kernel.org/r/20230724190759.483013-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 2 +- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 3 ++- fs/Kconfig | 2 +- include/linux/mm.h | 2 +- mm/Kconfig | 5 ++++- 7 files changed, 11 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e71d5bf2cee0..dc56ddf9ba03 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -60,7 +60,7 @@ config LOONGARCH select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select BUILDTIME_TABLE_SORT select COMMON_CLK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c86..6943d34c1ec1 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 290b6f93b816..8ff6d1c21e38 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,7 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db409770..78224aa76409 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -128,7 +128,8 @@ config X86 select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 19975b104bc3..f3be721bab6d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -254,7 +254,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ce73c20dcc..b2520dd555f9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3633,7 +3633,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #define VMEMMAP_RESERVE_NR 2 -#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { diff --git a/mm/Kconfig b/mm/Kconfig index 22acffd9009d..1959d048bbf5 100644 
--- a/mm/Kconfig +++ b/mm/Kconfig @@ -487,7 +487,10 @@ config SPARSEMEM_VMEMMAP # Select this config option from the architecture Kconfig, if it is preferred # to enable the feature of HugeTLB/dev_dax vmemmap optimization. # -config ARCH_WANT_OPTIMIZE_VMEMMAP +config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + bool + +config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool config HAVE_MEMBLOCK_PHYS_MAP -- cgit v1.2.3 From b8cf32dc6e8c75b712cbf638e0fd210101c22f17 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 20 Jun 2023 19:46:44 +0000 Subject: mm: zswap: multiple zpools support Support using multiple zpools of the same type in zswap, for concurrency purposes. A fixed number of 32 zpools is suggested by this commit, which was determined empirically. It can be later changed or made into a config option if needed. On a setup with zswap and zsmalloc, comparing a single zpool to 32 zpools shows improvements in the zsmalloc lock contention, especially on the swap out path. The following shows the perf analysis of the swapout path when 10 workloads are simultaneously reclaiming and refaulting tmpfs pages. There are some improvements on the swap in path as well, but less significant. 1 zpool: |--28.99%--zswap_frontswap_store | | |--8.98%--zpool_map_handle | | | --8.98%--zs_zpool_map | | | --8.95%--zs_map_object | | | --8.38%--_raw_spin_lock | | | --7.39%--queued_spin_lock_slowpath | |--8.82%--zpool_malloc | | | --8.82%--zs_zpool_malloc | | | --8.80%--zs_malloc | | | |--7.21%--_raw_spin_lock | | | | | --6.81%--queued_spin_lock_slowpath 32 zpools: |--16.73%--zswap_frontswap_store | | |--1.81%--zpool_malloc | | | --1.81%--zs_zpool_malloc | | | --1.79%--zs_malloc | | | --0.73%--obj_malloc | |--1.06%--zswap_update_total_size | |--0.59%--zpool_map_handle | | | --0.59%--zs_zpool_map | | | --0.57%--zs_map_object | | | --0.51%--_raw_spin_lock Link: https://lkml.kernel.org/r/20230620194644.3142384-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Yu Zhao Acked-by: Chris Li (Google) Reviewed-by: Nhat Pham Tested-by: Nhat Pham Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Konrad Rzeszutek Wilk Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 81 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 62195f72bf56..258e4e17799a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED( CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); +/* Number of zpools in zswap_pool (empirically determined for scalability) */ +#define ZSWAP_NR_ZPOOLS 32 + /********************************* * data structures **********************************/ @@ -161,7 +164,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. 
*/ struct zswap_pool { - struct zpool *zpool; + struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; @@ -248,7 +251,7 @@ static bool zswap_has_pool; #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpool)) + zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree); @@ -272,11 +275,13 @@ static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; + int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - total += zpool_get_total_size(pool->zpool); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_size(pool->zpools[i]); rcu_read_unlock(); @@ -365,6 +370,16 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +static struct zpool *zswap_find_zpool(struct zswap_entry *entry) +{ + int i = 0; + + if (ZSWAP_NR_ZPOOLS > 1) + i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); + + return entry->pool->zpools[i]; +} + /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. @@ -381,7 +396,7 @@ static void zswap_free_entry(struct zswap_entry *entry) spin_lock(&entry->pool->lru_lock); list_del(&entry->lru); spin_unlock(&entry->pool->lru_lock); - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -590,7 +605,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - if (strcmp(zpool_get_type(pool->zpool), type)) + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) @@ -695,6 +711,7 @@ static void shrink_worker(struct work_struct *w) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { + int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; @@ -715,15 +732,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (!pool) return NULL; - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp); - if (!pool->zpool) { - pr_err("%s zpool not available\n", type); - goto error; + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -755,8 +775,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - if (pool->zpool) - zpool_destroy_pool(pool->zpool); + while (i--) + zpool_destroy_pool(pool->zpools[i]); kfree(pool); return NULL; } @@ -805,11 +825,14 @@ static struct 
zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { + int i; + zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - zpool_destroy_pool(pool->zpool); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); kfree(pool); } @@ -1073,7 +1096,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct page *page; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - struct zpool *pool = entry->pool->zpool; + struct zpool *pool = zswap_find_zpool(entry); u8 *src, *tmp = NULL; unsigned int dlen; @@ -1214,6 +1237,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct zswap_pool *pool; + struct zpool *zpool; int ret; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; @@ -1324,10 +1348,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ + zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(entry->pool->zpool)) + if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle); + ret = zpool_malloc(zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; @@ -1336,9 +1361,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO); + buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); - zpool_unmap_handle(entry->pool->zpool, handle); + zpool_unmap_handle(zpool, handle); mutex_unlock(acomp_ctx->mutex); /* populate entry */ @@ -1409,6 +1434,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src, *dst, *tmp; + struct zpool *zpool; unsigned int dlen; int ret; @@ -1430,7 +1456,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, goto stats; } - if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + zpool = zswap_find_zpool(entry); + if (!zpool_can_sleep_mapped(zpool)) { tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; @@ -1440,12 +1467,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + if (!zpool_can_sleep_mapped(zpool)) { memcpy(tmp, src, entry->length); src = tmp; - zpool_unmap_handle(entry->pool->zpool, entry->handle); + zpool_unmap_handle(zpool, entry->handle); } acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1457,8 +1484,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - if (zpool_can_sleep_mapped(entry->pool->zpool)) - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); else kfree(tmp); @@ -1619,7 +1646,7 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpool)); + 
zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; } else { -- cgit v1.2.3 From 42c06a0e8ebe95b81e5fb41c6556ff22d9255b0c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 17 Jul 2023 12:02:27 -0400 Subject: mm: kill frontswap The only user of frontswap is zswap, and has been for a long time. Have swap call into zswap directly and remove the indirection. [hannes@cmpxchg.org: remove obsolete comment, per Yosry] Link: https://lkml.kernel.org/r/20230719142832.GA932528@cmpxchg.org [fengwei.yin@intel.com: don't warn if none swapcache folio is passed to zswap_load] Link: https://lkml.kernel.org/r/20230810095652.3905184-1-fengwei.yin@intel.com Link: https://lkml.kernel.org/r/20230717160227.GA867137@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Yin Fengwei Acked-by: Konrad Rzeszutek Wilk Acked-by: Nhat Pham Acked-by: Yosry Ahmed Acked-by: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Matthew Wilcox (Oracle) Cc: Vitaly Wool Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 14 +- Documentation/mm/frontswap.rst | 264 -------------------- Documentation/mm/index.rst | 1 - Documentation/translations/zh_CN/mm/frontswap.rst | 196 --------------- Documentation/translations/zh_CN/mm/index.rst | 1 - MAINTAINERS | 7 - fs/proc/meminfo.c | 1 + include/linux/frontswap.h | 91 ------- include/linux/swap.h | 9 - include/linux/swapfile.h | 5 - include/linux/zswap.h | 37 +++ mm/Kconfig | 4 - mm/Makefile | 1 - mm/frontswap.c | 283 ---------------------- mm/page_io.c | 6 +- mm/swapfile.c | 33 +-- mm/zswap.c | 159 +++++------- 17 files changed, 121 insertions(+), 991 deletions(-) delete mode 100644 Documentation/mm/frontswap.rst delete mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst delete mode 100644 include/linux/frontswap.h create mode 100644 include/linux/zswap.h delete mode 100644 mm/frontswap.c (limited to 'mm') diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index c5c2c7dbb155..45b98390e938 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -49,7 +49,7 @@ compressed pool. Design ====== -Zswap receives pages for compression through the Frontswap API and is able to +Zswap receives pages for compression from the swap subsystem and is able to evict pages from its own compressed pool on an LRU basis and write them back to the backing swap device in the case that the compressed pool is full. @@ -70,19 +70,19 @@ means the compression ratio will always be 2:1 or worse (because of half-full zbud pages). The zsmalloc type zpool has a more complex compressed page storage method, and it can achieve greater storage densities. -When a swap page is passed from frontswap to zswap, zswap maintains a mapping +When a swap page is passed from swapout to zswap, zswap maintains a mapping of the swap entry, a combination of the swap type and swap offset, to the zpool handle that references that compressed swap page. This mapping is achieved with a red-black tree per swap type. The swap offset is the search key for the tree nodes. -During a page fault on a PTE that is a swap entry, frontswap calls the zswap -load function to decompress the page into the page allocated by the page fault -handler. +During a page fault on a PTE that is a swap entry, the swapin code calls the +zswap load function to decompress the page into the page allocated by the page +fault handler. 
Once there are no PTEs referencing a swap page stored in zswap (i.e. the count -in the swap_map goes to 0) the swap code calls the zswap invalidate function, -via frontswap, to free the compressed entry. +in the swap_map goes to 0) the swap code calls the zswap invalidate function +to free the compressed entry. Zswap seeks to be simple in its policies. Sysfs attributes allow for one user controlled policy: diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst deleted file mode 100644 index c892412988af..000000000000 --- a/Documentation/mm/frontswap.rst +++ /dev/null @@ -1,264 +0,0 @@ -========= -Frontswap -========= - -Frontswap provides a "transcendent memory" interface for swap pages. -In some environments, dramatic performance savings may be obtained because -swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. 
- -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. -Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device". - -Frontswap with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. 
If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. - -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. - -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. 
- -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. - -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? - -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. 
In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the read swap device, -this is the correct course of action to ensure coherency. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst index 5a94a921ea40..31d2ac306438 100644 --- a/Documentation/mm/index.rst +++ b/Documentation/mm/index.rst @@ -44,7 +44,6 @@ above structured documentation, or deleted if it has served its purpose. balance damon/index free_page_reporting - frontswap hmm hwpoison hugetlbfs_reserv diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst deleted file mode 100644 index 434975390b48..000000000000 --- a/Documentation/translations/zh_CN/mm/frontswap.rst +++ /dev/null @@ -1,196 +0,0 @@ -:Original: Documentation/mm/frontswap.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========= -Frontswap -========= - -Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 -于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 -节省(提高)。 - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 -储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory -(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 -求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 -调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops -的功能,它提供的功能必须符合某些策略,如下所示: - -一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap -交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 -量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 -内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 -transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 -相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 - -一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 -要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 -经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, -也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 -可以像往常一样被写入交换空间。 - -请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” -的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 -不会从frontswap中获得。 - -如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 -debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): - -``failed_stores`` - 有多少次存储的尝试是失败的 - -``loads`` - 尝试了多少次加载(应该全部成功) - -``succ_stores`` - 有多少次存储的尝试是成功的 - -``invalidates`` - 尝试了多少次作废 - -后台实现可以提供额外的指标。 - -经常问到的问题 -============== - -* 价值在哪里? 
- -当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 -读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 -能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 -移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 -页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 - -Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 -灵活性: - -在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 -全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 -用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 -工作负载上,则有明显的性能改善(25%以上)。 - -“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory -的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 -统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 -交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 -可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 -有多少内存可用 - -在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 -用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 -是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory -后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, -而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 -的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap -允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), -从而减轻计划外交换可能带来的可怕的性能影响。 - -一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 -内存扩展技术的调查也在进行中。 - -* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? - -如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 -个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 -frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 -是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 -后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 -忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 -的,无论如何使用一小部分的 CPU 都是不相关的。 - -至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 -每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 -页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 -会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 -非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 - -当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 -产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 -来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 - -* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? - -我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 -明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 -供了多少内存是完全动态和随机的。 - -每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 -型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 -换页的尝试。 - -每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 -frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 -间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap -backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 -页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 -据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 -下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 -页面偏移量,否则它就会将数据写入该设备。 - -当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), -检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 -的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, -以便从真正的交换设备上获得这一页的数据。 - -所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 -入都被 “frontswap backend store” 和(可能)“frontswap backend loads” -所取代,这可能会快得多。 - -* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 - 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? 
- -首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 -结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 -假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 -及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 - -例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend -的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 -先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 -以根据当前的内存限制动态地定义。 - -此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 -块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 -不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 -态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 -复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 -的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” -线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, -KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 - -在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 -例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 -自己的交换。 - -transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 -能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, -frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 -容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 -swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap -就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 -backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 - - -* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, - 难道它不能总是被成功地覆盖吗? - -几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 -缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 -backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap -拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 -换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 - -* 为什么frontswap补丁会创建新的头文件swapfile.h? - -frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 -在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 -个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 - -Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst index 2f53e37b8049..b950dd118be7 100644 --- a/Documentation/translations/zh_CN/mm/index.rst +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -42,7 +42,6 @@ Linux内存管理文档 damon/index free_page_reporting ksm - frontswap hmm hwpoison hugetlbfs_reserv diff --git a/MAINTAINERS b/MAINTAINERS index 9e4cfcd7998a..9f0179682d91 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8404,13 +8404,6 @@ F: Documentation/power/freezing-of-tasks.rst F: include/linux/freezer.h F: kernel/freezer.c -FRONTSWAP API -M: Konrad Rzeszutek Wilk -L: linux-kernel@vger.kernel.org -S: Maintained -F: include/linux/frontswap.h -F: mm/frontswap.c - FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS M: David Howells L: linux-cachefs@redhat.com (moderated for non-subscribers) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8dca4d6d96c7..74e3c3815696 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,7 @@ #ifdef CONFIG_CMA #include #endif +#include #include #include "internal.h" diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h deleted file mode 100644 index eaa0ac5f9003..000000000000 --- a/include/linux/frontswap.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FRONTSWAP_H -#define _LINUX_FRONTSWAP_H - -#include -#include -#include -#include - -struct frontswap_ops { - void (*init)(unsigned); /* this swap type was just swapon'ed */ - int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ - int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */ - void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ - void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ -}; - -int frontswap_register_ops(const struct frontswap_ops *ops); - -extern void frontswap_init(unsigned type, unsigned long *map); -extern int __frontswap_store(struct page 
*page); -extern int __frontswap_load(struct page *page); -extern void __frontswap_invalidate_page(unsigned, pgoff_t); -extern void __frontswap_invalidate_area(unsigned); - -#ifdef CONFIG_FRONTSWAP -extern struct static_key_false frontswap_enabled_key; - -static inline bool frontswap_enabled(void) -{ - return static_branch_unlikely(&frontswap_enabled_key); -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ - p->frontswap_map = map; -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return p->frontswap_map; -} -#else -/* all inline routines become no-ops and all externs are ignored */ - -static inline bool frontswap_enabled(void) -{ - return false; -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return NULL; -} -#endif - -static inline int frontswap_store(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_store(page); - - return -1; -} - -static inline int frontswap_load(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_load(page); - - return -1; -} - -static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) -{ - if (frontswap_enabled()) - __frontswap_invalidate_page(type, offset); -} - -static inline void frontswap_invalidate_area(unsigned type) -{ - if (frontswap_enabled()) - __frontswap_invalidate_area(type); -} - -#endif /* _LINUX_FRONTSWAP_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 456546443f1f..bb5adc604144 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -302,10 +302,6 @@ struct swap_info_struct { struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ struct completion comp; /* seldom referenced */ -#ifdef CONFIG_FRONTSWAP - unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ - atomic_t frontswap_pages; /* frontswap pages in-use counter */ -#endif spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, @@ -630,11 +626,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif -#ifdef CONFIG_ZSWAP -extern u64 zswap_pool_total_size; -extern atomic_t zswap_stored_pages; -#endif - #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 7ed529a77c5b..99e3ed469e88 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -2,11 +2,6 @@ #ifndef _LINUX_SWAPFILE_H #define _LINUX_SWAPFILE_H -/* - * these were static in swapfile.c but frontswap.c needs them and we don't - * want to expose them to the dozens of source files that include swap.h - */ -extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); unsigned long arch_max_swapfile_size(void); diff --git a/include/linux/zswap.h b/include/linux/zswap.h new file mode 100644 index 000000000000..850c377d9b6d --- /dev/null +++ b/include/linux/zswap.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ZSWAP_H +#define _LINUX_ZSWAP_H + +#include +#include + +extern u64 zswap_pool_total_size; +extern atomic_t zswap_stored_pages; + +#ifdef CONFIG_ZSWAP + +bool zswap_store(struct page *page); +bool zswap_load(struct 
page *page); +void zswap_invalidate(int type, pgoff_t offset); +void zswap_swapon(int type); +void zswap_swapoff(int type); + +#else + +static inline bool zswap_store(struct page *page) +{ + return false; +} + +static inline bool zswap_load(struct page *page) +{ + return false; +} + +static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_swapon(int type) {} +static inline void zswap_swapoff(int type) {} + +#endif + +#endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 1959d048bbf5..5fe49c030961 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -25,7 +25,6 @@ menuconfig SWAP config ZSWAP bool "Compressed cache for swap pages" depends on SWAP - select FRONTSWAP select CRYPTO select ZPOOL help @@ -873,9 +872,6 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool -config FRONTSWAP - bool - config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index 678530a07326..e6d9a1d5e84d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -72,7 +72,6 @@ ifdef CONFIG_MMU endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o -obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/frontswap.c b/mm/frontswap.c deleted file mode 100644 index 2fb5df3384b8..000000000000 --- a/mm/frontswap.c +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Frontswap frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of frontswap. See - * Documentation/mm/frontswap.rst for more information. - * - * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); - -/* - * frontswap_ops are added by frontswap_register_ops, and provide the - * frontswap "backend" implementation functions. Multiple implementations - * may be registered, but implementations can never deregister. This - * is a simple singly-linked list of all registered implementations. - */ -static const struct frontswap_ops *frontswap_ops __read_mostly; - -#ifdef CONFIG_DEBUG_FS -/* - * Counters available via /sys/kernel/debug/frontswap (if debugfs is - * properly configured). These are for information only so are not protected - * against increment races. - */ -static u64 frontswap_loads; -static u64 frontswap_succ_stores; -static u64 frontswap_failed_stores; -static u64 frontswap_invalidates; - -static inline void inc_frontswap_loads(void) -{ - data_race(frontswap_loads++); -} -static inline void inc_frontswap_succ_stores(void) -{ - data_race(frontswap_succ_stores++); -} -static inline void inc_frontswap_failed_stores(void) -{ - data_race(frontswap_failed_stores++); -} -static inline void inc_frontswap_invalidates(void) -{ - data_race(frontswap_invalidates++); -} -#else -static inline void inc_frontswap_loads(void) { } -static inline void inc_frontswap_succ_stores(void) { } -static inline void inc_frontswap_failed_stores(void) { } -static inline void inc_frontswap_invalidates(void) { } -#endif - -/* - * Due to the asynchronous nature of the backends loading potentially - * _after_ the swap system has been activated, we have chokepoints - * on all frontswap functions to not call the backend until the backend - * has registered. 
- * - * This would not guards us against the user deciding to call swapoff right as - * we are calling the backend to initialize (so swapon is in action). - * Fortunately for us, the swapon_mutex has been taken by the callee so we are - * OK. The other scenario where calls to frontswap_store (called via - * swap_writepage) is racing with frontswap_invalidate_area (called via - * swapoff) is again guarded by the swap subsystem. - * - * While no backend is registered all calls to frontswap_[store|load| - * invalidate_area|invalidate_page] are ignored or fail. - * - * The time between the backend being registered and the swap file system - * calling the backend (via the frontswap_* functions) is indeterminate as - * frontswap_ops is not atomic_t (or a value guarded by a spinlock). - * That is OK as we are comfortable missing some of these calls to the newly - * registered backend. - * - * Obviously the opposite (unloading the backend) must be done after all - * the frontswap_[store|load|invalidate_area|invalidate_page] start - * ignoring or failing the requests. However, there is currently no way - * to unload a backend once it is registered. - */ - -/* - * Register operations for frontswap - */ -int frontswap_register_ops(const struct frontswap_ops *ops) -{ - if (frontswap_ops) - return -EINVAL; - - frontswap_ops = ops; - static_branch_inc(&frontswap_enabled_key); - return 0; -} - -/* - * Called when a swap device is swapon'd. - */ -void frontswap_init(unsigned type, unsigned long *map) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(sis == NULL); - - /* - * p->frontswap is a bitmap that we MUST have to figure out which page - * has gone in frontswap. Without it there is no point of continuing. - */ - if (WARN_ON(!map)) - return; - /* - * Irregardless of whether the frontswap backend has been loaded - * before this function or it will be later, we _MUST_ have the - * p->frontswap set to something valid to work properly. - */ - frontswap_map_set(sis, map); - - if (!frontswap_enabled()) - return; - frontswap_ops->init(type); -} - -static bool __frontswap_test(struct swap_info_struct *sis, - pgoff_t offset) -{ - if (sis->frontswap_map) - return test_bit(offset, sis->frontswap_map); - return false; -} - -static inline void __frontswap_set(struct swap_info_struct *sis, - pgoff_t offset) -{ - set_bit(offset, sis->frontswap_map); - atomic_inc(&sis->frontswap_pages); -} - -static inline void __frontswap_clear(struct swap_info_struct *sis, - pgoff_t offset) -{ - clear_bit(offset, sis->frontswap_map); - atomic_dec(&sis->frontswap_pages); -} - -/* - * "Store" data from a page to frontswap and associate it with the page's - * swaptype and offset. Page must be locked and in the swap cache. - * If frontswap already contains a page with matching swaptype and - * offset, the frontswap implementation may either overwrite the data and - * return success or invalidate the page from frontswap and return failure. 
- */ -int __frontswap_store(struct page *page) -{ - int ret = -1; - swp_entry_t entry = { .val = page_private(page), }; - int type = swp_type(entry); - struct swap_info_struct *sis = swap_info[type]; - pgoff_t offset = swp_offset(entry); - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(sis == NULL); - - /* - * If a dup, we must remove the old page first; we can't leave the - * old page no matter if the store of the new page succeeds or fails, - * and we can't rely on the new page replacing the old page as we may - * not store to the same implementation that contains the old page. - */ - if (__frontswap_test(sis, offset)) { - __frontswap_clear(sis, offset); - frontswap_ops->invalidate_page(type, offset); - } - - ret = frontswap_ops->store(type, offset, page); - if (ret == 0) { - __frontswap_set(sis, offset); - inc_frontswap_succ_stores(); - } else { - inc_frontswap_failed_stores(); - } - - return ret; -} - -/* - * "Get" data from frontswap associated with swaptype and offset that were - * specified when the data was put to frontswap and use it to fill the - * specified page with data. Page must be locked and in the swap cache. - */ -int __frontswap_load(struct page *page) -{ - int ret = -1; - swp_entry_t entry = { .val = page_private(page), }; - int type = swp_type(entry); - struct swap_info_struct *sis = swap_info[type]; - pgoff_t offset = swp_offset(entry); - bool exclusive = false; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(sis == NULL); - - if (!__frontswap_test(sis, offset)) - return -1; - - /* Try loading from each implementation, until one succeeds. */ - ret = frontswap_ops->load(type, offset, page, &exclusive); - if (ret == 0) { - inc_frontswap_loads(); - if (exclusive) { - SetPageDirty(page); - __frontswap_clear(sis, offset); - } - } - return ret; -} - -/* - * Invalidate any data from frontswap associated with the specified swaptype - * and offset so that a subsequent "get" will fail. - */ -void __frontswap_invalidate_page(unsigned type, pgoff_t offset) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(sis == NULL); - - if (!__frontswap_test(sis, offset)) - return; - - frontswap_ops->invalidate_page(type, offset); - __frontswap_clear(sis, offset); - inc_frontswap_invalidates(); -} - -/* - * Invalidate all data from frontswap associated with all offsets for the - * specified swaptype. 
- */ -void __frontswap_invalidate_area(unsigned type) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(sis == NULL); - - if (sis->frontswap_map == NULL) - return; - - frontswap_ops->invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - bitmap_zero(sis->frontswap_map, sis->max); -} - -static int __init init_frontswap(void) -{ -#ifdef CONFIG_DEBUG_FS - struct dentry *root = debugfs_create_dir("frontswap", NULL); - if (root == NULL) - return -ENXIO; - debugfs_create_u64("loads", 0444, root, &frontswap_loads); - debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores); - debugfs_create_u64("failed_stores", 0444, root, - &frontswap_failed_stores); - debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates); -#endif - return 0; -} - -module_init(init_frontswap); diff --git a/mm/page_io.c b/mm/page_io.c index ff4156a44d5d..5d0baba3578b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -19,12 +19,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "swap.h" static void __end_swap_bio_write(struct bio *bio) @@ -195,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } - if (frontswap_store(&folio->page) == 0) { + if (zswap_store(&folio->page)) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); @@ -512,7 +512,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } delayacct_swapin_start(); - if (frontswap_load(page) == 0) { + if (zswap_load(page)) { SetPageUptodate(page); unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 346e22b8ae97..e04eb9c0482d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,13 +35,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -95,7 +95,7 @@ static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -struct swap_info_struct *swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -744,7 +744,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - frontswap_invalidate_page(si->type, offset); + zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; @@ -2343,11 +2343,10 @@ static void _enable_swap_info(struct swap_info_struct *p) static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info, - unsigned long *frontswap_map) + struct swap_cluster_info *cluster_info) { - if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_init(p->type, frontswap_map); + zswap_swapon(p->type); + spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -2390,7 +2389,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct swap_info_struct *p = NULL; unsigned char *swap_map; struct swap_cluster_info *cluster_info; - unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; @@ -2515,12 +2513,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; 
p->cluster_info = NULL; - frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); - frontswap_invalidate_area(p->type); - frontswap_map_set(p, NULL); + zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; @@ -2528,7 +2524,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(cluster_info); - kvfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); @@ -2995,7 +2990,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long maxpages; unsigned char *swap_map = NULL; struct swap_cluster_info *cluster_info = NULL; - unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3135,11 +3129,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap_unlock_inode; } - /* frontswap enabled? set up bit-per-page map for frontswap */ - if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), - sizeof(long), - GFP_KERNEL); if ((swap_flags & SWAP_FLAG_DISCARD) && p->bdev && bdev_max_discard_sectors(p->bdev)) { @@ -3192,16 +3181,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); + enable_swap_info(p, prio, swap_map, cluster_info); - pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", - (p->flags & SWP_PAGE_DISCARD) ? "c" : "", - (frontswap_map) ? "FS" : ""); + (p->flags & SWP_PAGE_DISCARD) ? "c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -3231,7 +3219,6 @@ bad_swap: spin_unlock(&swap_lock); vfree(swap_map); kvfree(cluster_info); - kvfree(frontswap_map); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/zswap.c b/mm/zswap.c index 258e4e17799a..be1b6417ef5c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -2,7 +2,7 @@ /* * zswap.c - zswap driver file * - * zswap is a backend for frontswap that takes pages that are in the process + * zswap is a cache that takes pages that are in the process * of being swapped out and attempts to compress and store them in a * RAM-based memory pool. This can result in a significant I/O reduction on * the swap device and, in the case where decompressing from RAM is faster @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,7 @@ #include #include #include - +#include #include #include #include @@ -1084,7 +1083,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * * This can be thought of as a "resumed writeback" of the page * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the frontswap_store() + * writeback path that was intercepted with the zswap_store() * in the first place. 
After the page has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. @@ -1224,13 +1223,11 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } -/********************************* -* frontswap hooks -**********************************/ -/* attempts to compress and store an single page */ -static int zswap_frontswap_store(unsigned type, pgoff_t offset, - struct page *page) +bool zswap_store(struct page *page) { + swp_entry_t swp = { .val = page_private(page), }; + int type = swp_type(swp); + pgoff_t offset = swp_offset(swp); struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; @@ -1238,23 +1235,22 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct obj_cgroup *objcg = NULL; struct zswap_pool *pool; struct zpool *zpool; - int ret; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; gfp_t gfp; + int ret; + + VM_WARN_ON_ONCE(!PageLocked(page)); + VM_WARN_ON_ONCE(!PageSwapCache(page)); /* THP isn't supported */ - if (PageTransHuge(page)) { - ret = -EINVAL; - goto reject; - } + if (PageTransHuge(page)) + return false; - if (!zswap_enabled || !tree) { - ret = -ENODEV; - goto reject; - } + if (!zswap_enabled || !tree) + return false; /* * XXX: zswap reclaim does not work with cgroups yet. Without a @@ -1262,10 +1258,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, * local cgroup limits. */ objcg = get_obj_cgroup_from_page(page); - if (objcg && !obj_cgroup_may_zswap(objcg)) { - ret = -ENOMEM; + if (objcg && !obj_cgroup_may_zswap(objcg)) goto reject; - } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1275,10 +1269,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } if (zswap_pool_reached_full) { - if (!zswap_can_accept()) { - ret = -ENOMEM; + if (!zswap_can_accept()) goto shrink; - } else + else zswap_pool_reached_full = false; } @@ -1286,7 +1279,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry = zswap_entry_cache_alloc(GFP_KERNEL); if (!entry) { zswap_reject_kmemcache_fail++; - ret = -ENOMEM; goto reject; } @@ -1303,17 +1295,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } - if (!zswap_non_same_filled_pages_enabled) { - ret = -EINVAL; + if (!zswap_non_same_filled_pages_enabled) goto freepage; - } /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); - if (!entry->pool) { - ret = -EINVAL; + if (!entry->pool) goto freepage; - } /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1333,19 +1321,17 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, * synchronous in fact. * Theoretically, acomp supports users send multiple acomp requests in one * acomp instance, then get those requests done simultaneously. but in this - * case, frontswap actually does store and load page by page, there is no + * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done - * in one thread doing frontswap. + * in one thread doing zwap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. 
*/ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) { - ret = -EINVAL; + if (ret) goto put_dstmem; - } /* store */ zpool = zswap_find_zpool(entry); @@ -1381,15 +1367,12 @@ insert_entry: /* map */ spin_lock(&tree->lock); - do { - ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); - if (ret == -EEXIST) { - zswap_duplicate_entry++; - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, dupentry); - zswap_entry_put(tree, dupentry); - } - } while (ret == -EEXIST); + while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); + } if (entry->length) { spin_lock(&entry->pool->lru_lock); list_add(&entry->lru, &entry->pool->lru); @@ -1402,7 +1385,7 @@ insert_entry: zswap_update_total_size(); count_vm_event(ZSWPOUT); - return 0; + return true; put_dstmem: mutex_unlock(acomp_ctx->mutex); @@ -1412,23 +1395,20 @@ freepage: reject: if (objcg) obj_cgroup_put(objcg); - return ret; + return false; shrink: pool = zswap_pool_last_get(); if (pool) queue_work(shrink_wq, &pool->shrink_work); - ret = -ENOMEM; goto reject; } -/* - * returns 0 if the page was successfully decompressed - * return -1 on entry not found or error -*/ -static int zswap_frontswap_load(unsigned type, pgoff_t offset, - struct page *page, bool *exclusive) +bool zswap_load(struct page *page) { + swp_entry_t swp = { .val = page_private(page), }; + int type = swp_type(swp); + pgoff_t offset = swp_offset(swp); struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; struct scatterlist input, output; @@ -1436,15 +1416,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, u8 *src, *dst, *tmp; struct zpool *zpool; unsigned int dlen; - int ret; + bool ret; + + VM_WARN_ON_ONCE(!PageLocked(page)); /* find */ spin_lock(&tree->lock); entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { - /* entry was written back */ spin_unlock(&tree->lock); - return -1; + return false; } spin_unlock(&tree->lock); @@ -1452,7 +1433,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); zswap_fill_page(dst, entry->value); kunmap_atomic(dst); - ret = 0; + ret = true; goto stats; } @@ -1460,7 +1441,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, if (!zpool_can_sleep_mapped(zpool)) { tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { - ret = -ENOMEM; + ret = false; goto freeentry; } } @@ -1481,7 +1462,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)) + WARN_ON(1); mutex_unlock(acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) @@ -1489,16 +1471,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, else kfree(tmp); - BUG_ON(ret); + ret = true; stats: count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); freeentry: spin_lock(&tree->lock); - if (!ret && zswap_exclusive_loads_enabled) { + if (ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); - *exclusive = true; + SetPageDirty(page); } else if (entry->length) { spin_lock(&entry->pool->lru_lock); 
list_move(&entry->lru, &entry->pool->lru); @@ -1510,8 +1492,7 @@ freeentry: return ret; } -/* frees an entry in zswap */ -static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +void zswap_invalidate(int type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; @@ -1528,8 +1509,22 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) spin_unlock(&tree->lock); } -/* frees all zswap entries for the given swap type */ -static void zswap_frontswap_invalidate_area(unsigned type) +void zswap_swapon(int type) +{ + struct zswap_tree *tree; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; +} + +void zswap_swapoff(int type) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *n; @@ -1547,29 +1542,6 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static void zswap_frontswap_init(unsigned type) -{ - struct zswap_tree *tree; - - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { - pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; - } - - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; -} - -static const struct frontswap_ops zswap_frontswap_ops = { - .store = zswap_frontswap_store, - .load = zswap_frontswap_load, - .invalidate_page = zswap_frontswap_invalidate_page, - .invalidate_area = zswap_frontswap_invalidate_area, - .init = zswap_frontswap_init -}; - /********************************* * debugfs functions **********************************/ @@ -1658,16 +1630,11 @@ static int zswap_setup(void) if (!shrink_wq) goto fallback_fail; - ret = frontswap_register_ops(&zswap_frontswap_ops); - if (ret) - goto destroy_wq; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -destroy_wq: - destroy_workqueue(shrink_wq); fallback_fail: if (pool) zswap_pool_destroy(pool); -- cgit v1.2.3 From 34f4c198bfbe86612c368eb122002787acecaa93 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:40 +0100 Subject: zswap: make zswap_store() take a folio Patch series "Followup folio conversions for zswap". With frontswap killed, it's worth converting the zswap_load() and zswap_store() functions to take a folio instead of a page pointer. They aren't converted to support large folios, but there are a lot of unnecessary calls to compound_head() that are removed by these patches. This patch (of 4): Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. This does remove a few hidden calls to compound_head(). 
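To make the "hidden calls" concrete, here is a small standalone C sketch of the pattern; the struct layouts and helper bodies are invented for this illustration and are not the kernel's definitions. A page-flag style test has to resolve the head page on every call, while a folio is the head page by construction, so the lookup simply disappears at the converted call sites (folio_test_locked(), folio_test_swapcache() and friends in the hunks below).

        /*
         * Editorial sketch: toy model of the hidden head-page lookups.  The
         * struct layouts and helper bodies are invented for this demo and
         * are not the kernel's definitions.
         */
        #include <stdio.h>
        #include <stdbool.h>

        static int compound_head_calls;

        struct page { struct page *head; bool locked; };
        struct folio { struct page page; };   /* a folio is always a head page */

        static struct page *compound_head(struct page *page)
        {
                compound_head_calls++;        /* count the hidden lookups */
                return page->head;
        }

        /* page-flag style test: must resolve the head page on every call */
        static bool PageLocked(struct page *page)
        {
                return compound_head(page)->locked;
        }

        /* folio test: the caller already holds the head, no lookup needed */
        static bool folio_test_locked(const struct folio *folio)
        {
                return folio->page.locked;
        }

        int main(void)
        {
                struct page head = { .head = &head, .locked = true };
                struct page tail = { .head = &head };
                struct folio *folio = (struct folio *)&head;

                PageLocked(&tail);
                PageLocked(&head);
                printf("page API:  %d compound_head() calls\n", compound_head_calls);

                compound_head_calls = 0;
                folio_test_locked(folio);
                folio_test_locked(folio);
                printf("folio API: %d compound_head() calls\n", compound_head_calls);
                return 0;
        }

Built as ordinary C, the page-based pair of checks reports two head lookups and the folio-based pair reports none, which is the saving each converted PageX() call buys.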
Link: https://lkml.kernel.org/r/20230715042343.434588-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230715042343.434588-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/page_io.c | 2 +- mm/zswap.c | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 850c377d9b6d..9f318c8bc367 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -10,7 +10,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP -bool zswap_store(struct page *page); +bool zswap_store(struct folio *folio); bool zswap_load(struct page *page); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); @@ -18,7 +18,7 @@ void zswap_swapoff(int type); #else -static inline bool zswap_store(struct page *page) +static inline bool zswap_store(struct folio *folio) { return false; } diff --git a/mm/page_io.c b/mm/page_io.c index 5d0baba3578b..ac89685b562b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -195,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } - if (zswap_store(&folio->page)) { + if (zswap_store(folio)) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); diff --git a/mm/zswap.c b/mm/zswap.c index be1b6417ef5c..df3054e6a3a9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1223,11 +1223,12 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } -bool zswap_store(struct page *page) +bool zswap_store(struct folio *folio) { - swp_entry_t swp = { .val = page_private(page), }; + swp_entry_t swp = folio_swap_entry(folio); int type = swp_type(swp); pgoff_t offset = swp_offset(swp); + struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; @@ -1242,11 +1243,11 @@ bool zswap_store(struct page *page) gfp_t gfp; int ret; - VM_WARN_ON_ONCE(!PageLocked(page)); - VM_WARN_ON_ONCE(!PageSwapCache(page)); + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); - /* THP isn't supported */ - if (PageTransHuge(page)) + /* Large folios aren't supported */ + if (folio_test_large(folio)) return false; if (!zswap_enabled || !tree) -- cgit v1.2.3 From 074e3e262adb02a7a42e32e0aadfb6b6cf416854 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:41 +0100 Subject: memcg: convert get_obj_cgroup_from_page to get_obj_cgroup_from_folio As the one caller now has a folio, pass it in and use it. Removes three calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230715042343.434588-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 8 ++++---- mm/zswap.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 58eb7ca65699..058fb748e128 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1759,7 +1759,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); -struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); +struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1843,7 +1843,7 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } -static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51772df1abc5..062d925336cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3036,21 +3036,21 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) return objcg; } -struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; if (!memcg_kmem_online()) return NULL; - if (PageMemcgKmem(page)) { - objcg = __folio_objcg(page_folio(page)); + if (folio_memcg_kmem(folio)) { + objcg = __folio_objcg(folio); obj_cgroup_get(objcg); } else { struct mem_cgroup *memcg; rcu_read_lock(); - memcg = __folio_memcg(page_folio(page)); + memcg = __folio_memcg(folio); if (memcg) objcg = __get_obj_cgroup_from_memcg(memcg); else diff --git a/mm/zswap.c b/mm/zswap.c index df3054e6a3a9..9df33298f2dc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1258,7 +1258,7 @@ bool zswap_store(struct folio *folio) * cgroup-aware entry LRU, we will push out entries system-wide based on * local cgroup limits. */ - objcg = get_obj_cgroup_from_page(page); + objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) goto reject; -- cgit v1.2.3 From fbcec6a3a09b309900f1ecef8954721d93555abd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:42 +0100 Subject: swap: remove some calls to compound_head() in swap_readpage() Replace six implicit calls to compound_head() with one call to page_folio(). 
Link: https://lkml.kernel.org/r/20230715042343.434588-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/page_io.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index ac89685b562b..e3d62c1a0834 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -492,14 +492,15 @@ static void swap_readpage_bdev_async(struct page *page, void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) { + struct folio *folio = page_folio(page); struct swap_info_struct *sis = page_swap_info(page); - bool workingset = PageWorkingset(page); + bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; - VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageUptodate(page), page); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); /* * Count submission time as memory stall and delay. When the device @@ -513,8 +514,8 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) delayacct_swapin_start(); if (zswap_load(page)) { - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { -- cgit v1.2.3 From ca54f6d89d60abf3e7dea68c95dfd442eeece212 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:43 +0100 Subject: zswap: make zswap_load() take a folio Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. Removes three hidden calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230715042343.434588-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/page_io.c | 2 +- mm/zswap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 9f318c8bc367..2a60ce39cfde 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -11,7 +11,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP bool zswap_store(struct folio *folio); -bool zswap_load(struct page *page); +bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); @@ -23,7 +23,7 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct page *page) +static inline bool zswap_load(struct folio *folio) { return false; } diff --git a/mm/page_io.c b/mm/page_io.c index e3d62c1a0834..fe4c21af23f2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -513,7 +513,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } delayacct_swapin_start(); - if (zswap_load(page)) { + if (zswap_load(folio)) { folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { diff --git a/mm/zswap.c b/mm/zswap.c index 9df33298f2dc..7cc4a2baa713 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1405,11 +1405,12 @@ shrink: goto reject; } -bool zswap_load(struct page *page) +bool zswap_load(struct folio *folio) { - swp_entry_t swp = { .val = page_private(page), }; + swp_entry_t swp = folio_swap_entry(folio); int type = swp_type(swp); pgoff_t offset = swp_offset(swp); + struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; struct scatterlist input, output; @@ -1419,7 +1420,7 @@ bool zswap_load(struct page *page) unsigned int dlen; bool ret; - VM_WARN_ON_ONCE(!PageLocked(page)); + VM_WARN_ON_ONCE(!folio_test_locked(folio)); /* find */ spin_lock(&tree->lock); @@ -1481,7 +1482,7 @@ freeentry: spin_lock(&tree->lock); if (ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); - SetPageDirty(page); + folio_mark_dirty(folio); } else if (entry->length) { spin_lock(&entry->pool->lru_lock); list_move(&entry->lru, &entry->pool->lru); -- cgit v1.2.3 From d981e2804c92b505e76f44e66909f3ae805d3aa2 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:11 +0800 Subject: mm/page_ext: use page_ext_data helper in page_table_check Use page_ext_data helper in page_table_check to avoid access offset directly. 
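As an illustration of what the helper buys, the standalone sketch below models the layout that page_ext_data() hides; the types, sizes and client names are simplified stand-ins rather than the kernel's definitions, and only the base-plus-offset idea is taken from the hunk below. Each page_ext client registers an operations struct whose ->offset records where its private data sits inside the per-page extension area, and the helper performs the pointer arithmetic that the old code open-coded at every call site.

        /*
         * Standalone sketch of the layout behind page_ext_data().  The types,
         * sizes and client names are simplified stand-ins, not the kernel's
         * definitions; only the base-plus-offset idea is taken from the patch.
         */
        #include <stdio.h>
        #include <stdlib.h>
        #include <stddef.h>

        struct page_ext { unsigned long flags; };

        struct page_ext_operations {
                size_t size;
                size_t offset;  /* filled in when the extension area is laid out */
        };

        /* base pointer plus the client's registered offset (gcc/clang void* math) */
        static void *page_ext_data(struct page_ext *page_ext,
                                   struct page_ext_operations *ops)
        {
                return (void *)page_ext + ops->offset;
        }

        struct owner_data { int order; };           /* stand-in for page_owner */
        struct check_data { int anon_map_count; };  /* stand-in for page_table_check */

        int main(void)
        {
                struct page_ext_operations owner_ops = { .size = sizeof(struct owner_data) };
                struct page_ext_operations check_ops = { .size = sizeof(struct check_data) };
                size_t total = sizeof(struct page_ext);

                /* lay the clients out back to back behind struct page_ext */
                owner_ops.offset = total; total += owner_ops.size;
                check_ops.offset = total; total += check_ops.size;

                struct page_ext *ext = calloc(1, total);
                if (!ext)
                        return 1;

                struct owner_data *owner = page_ext_data(ext, &owner_ops);
                struct check_data *check = page_ext_data(ext, &check_ops);

                owner->order = 3;
                check->anon_map_count = 1;
                printf("owner data at +%zu, check data at +%zu\n",
                       owner_ops.offset, check_ops.offset);
                free(ext);
                return 0;
        }

Centralizing the arithmetic means a client such as page_table_check only names its own ops struct and never touches the offset directly.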
Link: https://lkml.kernel.org/r/20230718145812.1991717-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/page_table_check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 84c8163984e5..46e77c12c81e 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -51,7 +51,7 @@ struct page_ext_operations page_table_check_ops = { static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); - return (void *)(page_ext) + page_table_check_ops.offset; + return page_ext_data(page_ext, &page_table_check_ops); } /* -- cgit v1.2.3 From 1cac4c0760ecd0c33b11b7b5c609264ea6bed5ed Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:12 +0800 Subject: mm/page_ext: use page_ext_data helper in page_owner Use page_ext_data helper in page_owner to avoid access offset directly. Link: https://lkml.kernel.org/r/20230718145812.1991717-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index c93baef0148f..4e2723e1b300 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -104,7 +104,7 @@ struct page_ext_operations page_owner_ops = { static inline struct page_owner *get_page_owner(struct page_ext *page_ext) { - return (void *)page_ext + page_owner_ops.offset; + return page_ext_data(page_ext, &page_owner_ops); } static noinline depot_stack_handle_t save_stack(gfp_t flags) -- cgit v1.2.3 From 56c67049c0ee135868e83ce36ac8d3e037e03f82 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:23 -0400 Subject: mm: zswap: use zswap_invalidate_entry() for duplicates Patch series "mm: zswap: three cleanups". Three small cleanups to zswap, the first one suggested by Yosry during the frontswap removal. This patch (of 3): Minor cleanup. Instead of open-coding the tree deletion and the put, use the zswap_invalidate_entry() convenience helper. Link: https://lkml.kernel.org/r/20230727162343.1415598-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20230727162343.1415598-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Nhat Pham Cc: Barry Song Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 7cc4a2baa713..93707a1799b8 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1370,9 +1370,7 @@ insert_entry: spin_lock(&tree->lock); while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_duplicate_entry++; - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, dupentry); - zswap_entry_put(tree, dupentry); + zswap_invalidate_entry(tree, dupentry); } if (entry->length) { spin_lock(&entry->pool->lru_lock); -- cgit v1.2.3 From 7310895779624a11153d74a2132f01be6e360b7c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:24 -0400 Subject: mm: zswap: tighten up entry invalidation Removing a zswap entry from the tree is tied to an explicit operation that's supposed to drop the base reference: swap invalidation, exclusive load, duplicate store. 
Don't silently remove the entry on final put, but instead warn if an entry is in tree without reference. While in that diff context, convert a BUG_ON to a WARN_ON_ONCE. No need to crash on a refcount underflow. Link: https://lkml.kernel.org/r/20230727162343.1415598-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Cc: Barry Song Cc: Domenico Cerasuolo Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 93707a1799b8..ea921b25c245 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -417,9 +417,9 @@ static void zswap_entry_put(struct zswap_tree *tree, { int refcount = --entry->refcount; - BUG_ON(refcount < 0); + WARN_ON_ONCE(refcount < 0); if (refcount == 0) { - zswap_rb_erase(&tree->rbroot, entry); + WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_free_entry(entry); } } -- cgit v1.2.3 From 98804a944a63237814257dd149a5b04d6b93489c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:25 -0400 Subject: mm: zswap: kill zswap_get_swap_cache_page() The __read_swap_cache_async() interface isn't more difficult to understand than what the helper abstracts. Save the indirection and a level of indentation for the primary work of the writeback func. Link: https://lkml.kernel.org/r/20230727162343.1415598-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Cc: Vitaly Wool Cc: Barry Song Cc: Seth Jennings Cc: Domenico Cerasuolo Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 142 +++++++++++++++++++++++-------------------------------------- 1 file changed, 53 insertions(+), 89 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index ea921b25c245..8b6b1bc8a5f2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1039,43 +1039,6 @@ static int zswap_enabled_param_set(const char *val, /********************************* * writeback code **********************************/ -/* return enum for zswap_get_swap_cache_page */ -enum zswap_get_swap_ret { - ZSWAP_SWAPCACHE_NEW, - ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_FAIL, -}; - -/* - * zswap_get_swap_cache_page - * - * This is an adaption of read_swap_cache_async() - * - * This function tries to find a page with the given swap entry - * in the swapper_space address space (the swap cache). If the page - * is found, it is returned in retpage. Otherwise, a page is allocated, - * added to the swap cache, and returned in retpage. 
- * - * If success, the swap cache page is returned in retpage - * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache - * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, - * the new page is added to swapcache and locked - * Returns ZSWAP_SWAPCACHE_FAIL on error - */ -static int zswap_get_swap_cache_page(swp_entry_t entry, - struct page **retpage) -{ - bool page_was_allocated; - - *retpage = __read_swap_cache_async(entry, GFP_KERNEL, - NULL, 0, &page_was_allocated); - if (page_was_allocated) - return ZSWAP_SWAPCACHE_NEW; - if (!*retpage) - return ZSWAP_SWAPCACHE_FAIL; - return ZSWAP_SWAPCACHE_EXIST; -} - /* * Attempts to free an entry by adding a page to the swap cache, * decompressing the entry data into the page, and issuing a @@ -1096,7 +1059,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); - + bool page_was_allocated; u8 *src, *tmp = NULL; unsigned int dlen; int ret; @@ -1111,65 +1074,66 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache page */ - switch (zswap_get_swap_cache_page(swpentry, &page)) { - case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, + &page_was_allocated); + if (!page) { ret = -ENOMEM; goto fail; + } - case ZSWAP_SWAPCACHE_EXIST: - /* page is already in the swap cache, ignore for now */ + /* Found an existing page, we raced with load/swapin */ + if (!page_was_allocated) { put_page(page); ret = -EEXIST; goto fail; + } - case ZSWAP_SWAPCACHE_NEW: /* page is locked */ - /* - * Having a local reference to the zswap entry doesn't exclude - * swapping from invalidating and recycling the swap slot. Once - * the swapcache is secured against concurrent swapping to and - * from the slot, recheck that the entry is still current before - * writing. - */ - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(page_folio(page)); - ret = -ENOMEM; - goto fail; - } + /* + * Page is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap page with old compressed data. 
+ */ + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); + delete_from_swap_cache(page_folio(page)); + ret = -ENOMEM; + goto fail; + } + spin_unlock(&tree->lock); - /* decompress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - dlen = PAGE_SIZE; + /* decompress */ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + dlen = PAGE_SIZE; - src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; - zpool_unmap_handle(pool, entry->handle); - } + src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, entry->handle); + } - mutex_lock(acomp_ctx->mutex); - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - mutex_unlock(acomp_ctx->mutex); - - if (!zpool_can_sleep_mapped(pool)) - kfree(tmp); - else - zpool_unmap_handle(pool, entry->handle); + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + mutex_unlock(acomp_ctx->mutex); + + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, entry->handle); - BUG_ON(ret); - BUG_ON(dlen != PAGE_SIZE); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); - /* page is up to date */ - SetPageUptodate(page); - } + /* page is up to date */ + SetPageUptodate(page); /* move it to the tail of the inactive list after end_writeback */ SetPageReclaim(page); @@ -1180,16 +1144,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_written_back_pages++; return ret; + fail: if (!zpool_can_sleep_mapped(pool)) kfree(tmp); /* - * if we get here due to ZSWAP_SWAPCACHE_EXIST - * a load may be happening concurrently. - * it is safe and okay to not free the entry. - * it is also okay to return !0 - */ + * If we get here because the page is already in swapcache, a + * load may be happening concurrently. It is safe and okay to + * not free the entry. It is also okay to return !0. + */ return ret; } -- cgit v1.2.3 From 6e412203eeae68b599fb0a0722961e68f90322df Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 27 Jul 2023 09:55:58 +0800 Subject: mm/memory.c: fix some kernel-doc comments Add description of @mas and @tree_end, remove @mt in unmap_vmas(). 
to silence the warnings: mm/memory.c:1837: warning: Function parameter or member 'mas' not described in 'unmap_vmas' mm/memory.c:1837: warning: Function parameter or member 'tree_end' not described in 'unmap_vmas' mm/memory.c:1837: warning: Excess function parameter 'mt' description in 'unmap_vmas' Link: https://lkml.kernel.org/r/20230727015558.69554-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5996 Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f06266464208..1113ee625a94 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,10 +1693,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather - * @mt: the maple tree + * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping + * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. -- cgit v1.2.3 From 5d7800d9cb9ad1cc98c414036f03cc029508d1c5 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:10 +0800 Subject: mm: kmsan: use helper function page_size() Patch series "minor cleanups for kmsan". Use helper function and macros to improve code readability. No functional modification involved. This patch (of 3): Use function page_size() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230727011612.2721843-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230727011612.2721843-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 2 +- mm/kmsan/shadow.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index ec0da72e65aa..4e3c3e60ba97 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -117,7 +117,7 @@ void kmsan_kfree_large(const void *ptr) page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); kmsan_internal_poison_memory((void *)ptr, - PAGE_SIZE << compound_order(page), + page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index b8bb95eea5e3..c7de991f6d7f 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -210,7 +210,7 @@ void kmsan_free_page(struct page *page, unsigned int order) return; kmsan_enter_runtime(); kmsan_internal_poison_memory(page_address(page), - PAGE_SIZE << compound_order(page), + page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); -- cgit v1.2.3 From 4852a80524937db8215406cf2b2431b14f320000 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:11 +0800 Subject: mm: kmsan: use helper macro offset_in_page() Use helper macro offset_in_page() to improve code readability. No functional modification involved. 
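For readers who want to see the identity this cleanup relies on, the short program below checks it numerically; the 4K PAGE_SIZE value and the mask-based macro definition are assumptions made for the demo rather than text taken from this patch. For a power-of-two page size, masking off the high bits gives exactly the open-coded "addr % PAGE_SIZE" that the hunks replace.

        /*
         * Numeric check of the identity behind the cleanup.  The 4K PAGE_SIZE
         * and the mask-based macro below are assumptions for this demo, not
         * taken from the patch.
         */
        #include <stdio.h>
        #include <assert.h>

        #define PAGE_SIZE 4096UL
        #define PAGE_MASK (~(PAGE_SIZE - 1))
        #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK)

        int main(void)
        {
                unsigned long addrs[] = { 0x1000, 0x1fff, 0x12345678, 0xdeadbeef };

                for (int i = 0; i < 4; i++) {
                        unsigned long a = addrs[i];

                        /* same result as the open-coded form being replaced */
                        assert(offset_in_page(a) == a % PAGE_SIZE);
                        printf("%#lx -> offset %lu\n", a, offset_in_page(a));
                }
                return 0;
        }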
Link: https://lkml.kernel.org/r/20230727011612.2721843-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 2 +- mm/kmsan/shadow.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4e3c3e60ba97..5d6e2dee5692 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -339,7 +339,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, * internal KMSAN checks. */ while (size > 0) { - page_offset = addr % PAGE_SIZE; + page_offset = offset_in_page(addr); to_go = min(PAGE_SIZE - page_offset, (u64)size); kmsan_handle_dma_page((void *)addr, to_go, dir); addr += to_go; diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index c7de991f6d7f..966994268a01 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,7 +145,7 @@ void *kmsan_get_metadata(void *address, bool is_origin) return NULL; if (!page_has_metadata(page)) return NULL; - off = addr % PAGE_SIZE; + off = offset_in_page(addr); return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } -- cgit v1.2.3 From 108c3dc6cd3dbe36824469b7ea860337978e0439 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:12 +0800 Subject: mm: kmsan: use helper macros PAGE_ALIGN and PAGE_ALIGN_DOWN Use helper macros PAGE_ALIGN and PAGE_ALIGN_DOWN to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230727011612.2721843-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 966994268a01..87318f9170f1 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -281,8 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) struct page *page; u64 size; - start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); - size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + start = (void *)PAGE_ALIGN_DOWN((u64)start); + size = PAGE_ALIGN((u64)end - (u64)start); shadow = memblock_alloc(size, PAGE_SIZE); origin = memblock_alloc(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { -- cgit v1.2.3 From 866ff80176aa1f9c0ba65f2164cf608c5cde4851 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 27 Jul 2023 21:39:44 +0100 Subject: mm: improve the comment in isolate_migratepages_block() A recent patch shows that not everybody understands that "stabilise the mapping" really means "prevent the mapping from being freed", so change the wording to hopefully make that more clear. Link: https://lkml.kernel.org/r/ZMLWEB4m3zvX6SBN@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- mm/compaction.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e6ac0ef4c178..c4d3a3129fd5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1100,13 +1100,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool migrate_dirty; /* - * Only pages without mappings or that have a - * ->migrate_folio callback are possible to migrate - * without blocking. 
However, we can be racing with - * truncation so it's necessary to lock the page - * to stabilise the mapping as truncation holds - * the page lock until after the page is removed - * from the page cache. + * Only folios without mappings or that have + * a ->migrate_folio callback are possible to + * migrate without blocking. However, we may + * be racing with truncation, which can free + * the mapping. Truncation holds the folio lock + * until after the folio is removed from the page + * cache so holding it ourselves is sufficient. */ if (!folio_trylock(folio)) goto isolate_fail_put; -- cgit v1.2.3 From e7ee3f9791f5601fc032b222a70a02b9798784be Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 28 Jul 2023 06:21:57 +0900 Subject: damon: use pmdp_get instead of drectly dereferencing pmd As ptep_get, Use the pmdp_get wrapper when we accessing pmdval instead of directly dereferencing pmd. Link: https://lkml.kernel.org/r/20230727212157.2985025-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 23 +++++++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index e940802a15a4..ac1c3fa80f98 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -54,7 +54,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct folio *folio = damon_get_folio(pmd_pfn(*pmd)); + struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 40801e38fcf0..909db25efb35 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -94,7 +94,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(*pvmw.pmd) || + *accessed = pmd_young(pmdp_get(pvmw.pmd)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 2fcc9731528a..d01cc46f4bf4 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -301,16 +301,19 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pmd_t pmde; spinlock_t *ptl; - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmdp_get(pmd))) { ptl = pmd_lock(walk->mm, pmd); - if (!pmd_present(*pmd)) { + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmde)) { damon_pmdp_mkold(pmd, walk->vma, addr); spin_unlock(ptl); return 0; @@ -439,21 +442,25 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmdp_get(pmd))) { + pmd_t pmde; + ptl = pmd_lock(walk->mm, pmd); - if (!pmd_present(*pmd)) { + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } - if (!pmd_trans_huge(*pmd)) { + if (!pmd_trans_huge(pmde)) { spin_unlock(ptl); goto regular_page; } - folio = damon_get_folio(pmd_pfn(*pmd)); + folio = damon_get_folio(pmd_pfn(pmde)); if (!folio) goto huge_out; - if (pmd_young(*pmd) || !folio_test_idle(folio) || + if 
(pmd_young(pmde) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; -- cgit v1.2.3 From c456832e6a8d8bcdfde4e8b3f66895aaffbd2832 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:25 +0800 Subject: mm/page_poison: remove unused page_ext.h from page_poison Patch series "minor cleanups to page_ext header". No page_ext function or structure is used in page_poison. Just remove page_ext header from page_poison. Link: https://lkml.kernel.org/r/20230717113227.1897173-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230717113227.1897173-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- mm/page_poison.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/page_poison.c b/mm/page_poison.c index 98438985e1ed..b4f456437b7e 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From c6493f4bd789eafc918a5e210e80256e2284a7c0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:26 +0800 Subject: mm/vmstat: remove unused page_ext.h from vmstat No page_ext function or structure is used in vmstat. Just remove page_ext header from vmstat. Link: https://lkml.kernel.org/r/20230717113227.1897173-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index b731d57996c5..00e81e99c6ee 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From ebddd111fcd13fefd7350f77201dfc5605672909 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 1 Aug 2023 20:37:23 +0800 Subject: mm/page_alloc: avoid unneeded alike_pages calculation When free_pages is 0, alike_pages is not used. So alike_pages calculation can be avoided by checking free_pages early to save cpu cycles. Also fix typo 'comparable'. It should be 'compatible' here. Link: https://lkml.kernel.org/r/20230801123723.2225543-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1eb3864e1dbc..8b17dcbb925d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1833,6 +1833,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, free_pages = move_freepages_block(zone, page, start_type, &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + /* * Determine how many pages are compatible with our allocation. * For movable allocation, it's the number of movable pages which @@ -1854,14 +1858,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, else alike_pages = 0; } - - /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) - goto single_page; - /* * If a sufficient number of pages in the block are either free or of - * comparable migratability as our allocation, claim the whole block. + * compatible migratability as our allocation, claim the whole block. 
*/ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) -- cgit v1.2.3 From 2a158e956b98e0c5f37b5ce9953048654348ad6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 29 Jul 2023 20:37:33 +0000 Subject: mm/damon/core-test: add a test for damos_new_filter() damos_new_filter() was having a bug that not initializing ->list field of the returning damos_filter struct, which results in access to uninitialized memory. Add a unit test for the function. Link: https://lkml.kernel.org/r/20230729203733.38949-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index bb07721909e1..4bddbfe243c3 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -341,6 +341,18 @@ static void damon_test_set_attrs(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); } +static void damos_test_new_filter(struct kunit *test) +{ + struct damos_filter *filter; + + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); + KUNIT_EXPECT_EQ(test, filter->matching, true); + KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); + KUNIT_EXPECT_PTR_EQ(test, filter->list.next, &filter->list); + damos_destroy_filter(filter); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -353,6 +365,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_regions), KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), + KUNIT_CASE(damos_test_new_filter), {}, }; -- cgit v1.2.3 From 0388536ac29104a478c79b3869541524caec28eb Mon Sep 17 00:00:00 2001 From: Efly Young Date: Fri, 21 Jul 2023 09:41:16 +0800 Subject: mm:vmscan: fix inaccurate reclaim during proactive reclaim Before commit f53af4285d77 ("mm: vmscan: fix extreme overreclaim and swap floods"), proactive reclaim will extreme overreclaim sometimes. But proactive reclaim still inaccurate and some extent overreclaim. Problematic case is easy to construct. Allocate lots of anonymous memory (e.g., 20G) in a memcg, then swapping by writing memory.recalim and there is a certain probability of overreclaim. For example, request 1G by writing memory.reclaim will eventually reclaim 1.7G or other values more than 1G. The reason is that reclaimer may have already reclaimed part of requested memory in one loop, but before adjust sc->nr_to_reclaim in outer loop, call shrink_lruvec() again will still follow the current sc->nr_to_reclaim to work. It will eventually lead to overreclaim. In theory, the amount of reclaimed would be in [request, 2 * request). Reclaimer usually tends to reclaim more than request. But either direct or kswapd reclaim have much smaller nr_to_reclaim targets, so it is less noticeable and not have much impact. Proactive reclaim can usually come in with a larger value, so the error is difficult to ignore. Considering proactive reclaim is usually low frequency, handle the batching into smaller chunks is a better approach. 
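To make the batching argument concrete, here is a stand-alone sketch in ordinary userspace C (not kernel code) of the worst case described above: assume a single reclaim call may return up to twice the amount requested, i.e. the [request, 2 * request) window. Capping each request at SWAP_CLUSTER_MAX then bounds the total overshoot to at most one chunk instead of up to the whole request. The reclaim_once() helper is hypothetical and exists only to encode that assumption.

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Hypothetical worst-case reclaimer: always returns twice the request. */
static unsigned long reclaim_once(unsigned long request)
{
        return 2 * request;
}

int main(void)
{
        unsigned long target = 262144;  /* roughly 1G of 4K pages */
        unsigned long reclaimed = 0;

        while (reclaimed < target) {
                unsigned long left = target - reclaimed;
                unsigned long batch = left < SWAP_CLUSTER_MAX ?
                                        left : SWAP_CLUSTER_MAX;

                reclaimed += reclaim_once(batch);
        }
        /* Overshoot is now bounded by one batch, not by the whole request. */
        printf("reclaimed %lu of %lu pages (overshoot %lu)\n",
               reclaimed, target, reclaimed - target);
        return 0;
}

Running the same loop with batch = left (the pre-patch behaviour) lets a single call overshoot by up to the entire remaining target, which is the 1G-requested/1.7G-reclaimed case reported above.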
Link: https://lkml.kernel.org/r/20230721014116.3388-1-yangyifei03@kuaishou.com Signed-off-by: Efly Young Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 062d925336cb..56abc4f426f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6686,8 +6686,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; -- cgit v1.2.3 From 669281ee7ef731fb5204df9d948669bf32a5e68d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:02 -0700 Subject: Multi-gen LRU: fix per-zone reclaim MGLRU has a LRU list for each zone for each type (anon/file) in each generation: long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; The min_seq (oldest generation) can progress independently for each type but the max_seq (youngest generation) is shared for both anon and file. This is to maintain a common frame of reference. In order for eviction to advance the min_seq of a type, all the per-zone lists in the oldest generation of that type must be empty. The eviction logic only considers pages from eligible zones for eviction or promotion. scan_folios() { ... for (zone = sc->reclaim_idx; zone >= 0; zone--) { ... sort_folio(); // Promote ... isolate_folio(); // Evict } ... } Consider the system has the movable zone configured and default 4 generations. The current state of the system is as shown below (only illustrating one type for simplicity): Type: ANON Zone DMA32 Normal Movable Device Gen 0 0 0 4GB 0 Gen 1 0 1GB 1MB 0 Gen 2 1MB 4GB 1MB 0 Gen 3 1MB 1MB 1MB 0 Now consider there is a GFP_KERNEL allocation request (eligible zone index <= Normal), evict_folios() will return without doing any work since there are no pages to scan in the eligible zones of the oldest generation. Reclaim won't make progress until triggered from a ZONE_MOVABLE allocation request; which may not happen soon if there is a lot of free memory in the movable zone. This can lead to OOM kills, although there is 1GB pages in the Normal zone of Gen 1 that we have not yet tried to reclaim. This issue is not seen in the conventional active/inactive LRU since there are no per-zone lists. If there are no (not enough) folios to scan in the eligible zones, move folios from ineligible zone (zone_index > reclaim_index) to the next generation. This allows for the progression of min_seq and reclaiming from the next generation (Gen 1). Qualcomm, Mediatek and raspberrypi [1] discovered this issue independently. 
[1] https://github.com/raspberrypi/linux/issues/5395 Link: https://lkml.kernel.org/r/20230802025606.346758-1-kaleshsingh@google.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Kalesh Singh Reported-by: Charan Teja Kalla Reported-by: Lecopzer Chen Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: Aneesh Kumar K V Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4039620d30fe..489a4fc7d9b1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4889,7 +4889,8 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec) * the eviction ******************************************************************************/ -static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, + int tier_idx) { bool success; int gen = folio_lru_gen(folio); @@ -4939,6 +4940,13 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) return true; } + /* ineligible */ + if (zone > sc->reclaim_idx) { + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + /* waiting for writeback */ if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { @@ -4987,7 +4995,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { - int gen, zone; + int i; + int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; @@ -5003,9 +5012,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, gen = lru_gen_from_seq(lrugen->min_seq[type]); - for (zone = sc->reclaim_idx; zone >= 0; zone--) { + for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); int skipped = 0; + int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { @@ -5019,7 +5029,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, scanned += delta; - if (sort_folio(lruvec, folio, tier)) + if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); -- cgit v1.2.3 From bb5e7f234eacf34b65be67ebb3613e3b8cf11b87 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:03 -0700 Subject: Multi-gen LRU: avoid race in inc_min_seq() inc_max_seq() will try to inc_min_seq() if nr_gens == MAX_NR_GENS. This is because the generations are reused (the last oldest now empty generation will become the next youngest generation). inc_min_seq() is retried until successful, dropping the lru_lock and yielding the CPU on each failure, and retaking the lock before trying again: while (!inc_min_seq(lruvec, type, can_swap)) { spin_unlock_irq(&lruvec->lru_lock); cond_resched(); spin_lock_irq(&lruvec->lru_lock); } However, the initial condition that required incrementing the min_seq (nr_gens == MAX_NR_GENS) is not retested. 
This can change by another call to inc_max_seq() from run_aging() with force_scan=true from the debugfs interface. Since the eviction stalls when the nr_gens == MIN_NR_GENS, avoid unnecessarily incrementing the min_seq by rechecking the number of generations before each attempt. This issue was uncovered in previous discussion on the list by Yu Zhao and Aneesh Kumar [1]. [1] https://lore.kernel.org/linux-mm/CAOUHufbO7CaVm=xjEb1avDhHVvnC8pJmGyKcFf2iY_dpf+zR3w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230802025606.346758-2-kaleshsingh@google.com Fixes: d6c3af7d8a2b ("mm: multi-gen LRU: debugfs interface") Signed-off-by: Kalesh Singh Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Aneesh Kumar K V Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Lecopzer Chen Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 489a4fc7d9b1..6eecd291756c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4439,7 +4439,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; - +restart: spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4450,11 +4450,12 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - while (!inc_min_seq(lruvec, type, can_swap)) { - spin_unlock_irq(&lruvec->lru_lock); - cond_resched(); - spin_lock_irq(&lruvec->lru_lock); - } + if (inc_min_seq(lruvec, type, can_swap)) + continue; + + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + goto restart; } /* -- cgit v1.2.3 From a3235ea2a88b7874204c39ebb20feb712f4dba9d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:04 -0700 Subject: Multi-gen LRU: fix can_swap in lru_gen_look_around() walk->can_swap might be invalid since it's not guaranteed to be initialized for the particular lruvec. Instead deduce it from the folio type (anon/file). 
Link: https://lkml.kernel.org/r/20230802025606.346758-3-kaleshsingh@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Kalesh Singh Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Aneesh Kumar K V Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Lecopzer Chen Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 6eecd291756c..b4329f93a682 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4656,6 +4656,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); + bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); @@ -4704,7 +4705,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!pte_young(ptent)) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; -- cgit v1.2.3 From b69f92a741405336fb17a8a3d67fc144192fe8e2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:17 +0000 Subject: mm/damon/sysfs-schemes: implement DAMOS tried total bytes file Patch series "mm/damon/sysfs-schemes: implement DAMOS tried total bytes file". The tried_regions directory of DAMON sysfs interface is useful for retrieving monitoring results snapshot or DAMOS debugging. However, for common use case that need to monitor only the total size of the scheme tried regions (e.g., monitoring working set size), the kernel overhead for directory construction and user overhead for reading the content could be high if the number of monitoring region is not small. This patchset implements DAMON sysfs files for efficient support of the use case. The first patch implements the sysfs file to reduce the user space overhead, and the second patch implements a command for reducing the kernel space overhead. The third patch adds a selftest for the new file, and following two patches update documents. [1] https://lore.kernel.org/damon/20230728201817.70602-1-sj@kernel.org/ This patch (of 5): The tried_regions directory can be used for retrieving the monitoring results snapshot for regions of specific access pattern, by setting the scheme's action as 'stat' and the access pattern as required. While the interface provides every detail of the monitoring results, some use cases including working set size monitoring requires only the total size of the regions. For such cases, users should read all the information and calculate the total size of the regions. However, it could incur high overhead if the number of regions is high. Add a file for retrieving only the information, namely 'total_bytes' file. It allows users to get the total size by reading only the file. 
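As a rough usage sketch (not part of the patch), the program below reads the new total_bytes file for one scheme. The kdamond, context and scheme indices in the path are assumptions about a particular setup and need to be adjusted accordingly.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        /* Assumed layout: kdamond 0, context 0, scheme 0. */
        const char *path = "/sys/kernel/mm/damon/admin/kdamonds/0/"
                           "contexts/0/schemes/0/tried_regions/total_bytes";
        char buf[32] = { 0 };
        FILE *f = fopen(path, "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (!fgets(buf, sizeof(buf), f)) {
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("tried regions total: %lu bytes\n", strtoul(buf, NULL, 10));
        return 0;
}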
Link: https://lkml.kernel.org/r/20230802213222.109841-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802213222.109841-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50cf89dcd898..6d3462eb31f2 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; + unsigned long total_bytes; }; static struct damon_sysfs_scheme_regions * @@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void) regions->kobj = (struct kobject){}; INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; + regions->total_bytes = 0; return regions; } +static ssize_t total_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_regions *regions = container_of(kobj, + struct damon_sysfs_scheme_regions, kobj); + + return sysfs_emit(buf, "%lu\n", regions->total_bytes); +} + static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_regions *regions) { @@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); } +static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr = + __ATTR_RO_MODE(total_bytes, 0400); + static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + &damon_sysfs_scheme_regions_total_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); @@ -1648,6 +1663,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + sysfs_regions->total_bytes += r->ar.end - r->ar.start; region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1678,6 +1694,7 @@ int damon_sysfs_schemes_clear_regions( sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); + sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } -- cgit v1.2.3 From 6ad243b83b5094026fdb3171711ddb25246b3d8a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:18 +0000 Subject: mm/damon/sysfs: implement a command for updating only schemes tried total bytes Using tried_regions/total_bytes file, users can efficiently retrieve the total size of memory regions having specific access pattern. However, DAMON sysfs interface in kernel still populates all the infomration on the tried_regions subdirectories. That means the kernel part overhead for the construction of tried regions directories still exists. To remove the overhead, implement yet another command input for 'state' DAMON sysfs file. Writing the input to the file makes DAMON sysfs interface to update only the total_bytes file. 
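A minimal illustration of using the new command from user space is to write the keyword to the kdamond's state file, then read total_bytes as in the previous sketch. The kdamond index is again an assumption, and the kdamond must already be running for the command to be accepted.

#include <stdio.h>

int main(void)
{
        /* Assumed layout: kdamond 0; the kdamond must be in the "on" state. */
        FILE *f = fopen("/sys/kernel/mm/damon/admin/kdamonds/0/state", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fputs("update_schemes_tried_bytes\n", f) == EOF) {
                fclose(f);
                return 1;
        }
        return fclose(f) ? 1 : 0;
}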
Link: https://lkml.kernel.org/r/20230802213222.109841-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 +- mm/damon/sysfs-schemes.c | 7 ++++++- mm/damon/sysfs.c | 26 ++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index db677eba78fd..fd482a0639b4 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats( int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_ctx *ctx, bool total_bytes_only); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6d3462eb31f2..9a015079f3a4 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1635,6 +1635,7 @@ void damon_sysfs_schemes_update_stats( */ static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; static int damon_sysfs_schemes_region_idx; +static bool damos_regions_upd_total_bytes_only; /* * DAMON callback that called before damos apply. While this callback is @@ -1664,6 +1665,9 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; sysfs_regions->total_bytes += r->ar.end - r->ar.start; + if (damos_regions_upd_total_bytes_only) + return 0; + region = damon_sysfs_scheme_region_alloc(r); list_add_tail(®ion->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1702,10 +1706,11 @@ int damon_sysfs_schemes_clear_regions( /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_ctx *ctx, bool total_bytes_only) { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 33e1d5c9cb54..b86ba7b0a921 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update + * tried_regions/total_bytes sysfs files for each scheme. 
+ */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried * regions @@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", }; @@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; struct damon_sysfs_kdamond *kdamond; + enum damon_sysfs_cmd cmd; /* damon_sysfs_schemes_update_regions_stop() might not yet called */ kdamond = damon_sysfs_cmd_request.kdamond; - if (kdamond && damon_sysfs_cmd_request.cmd == - DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && - ctx == kdamond->damon_ctx) { + cmd = damon_sysfs_cmd_request.cmd; + if (kdamond && ctx == kdamond->damon_ctx && + (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { damon_sysfs_schemes_update_regions_stop(ctx); mutex_unlock(&damon_sysfs_lock); } @@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) } static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond) + struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) { struct damon_ctx *ctx = kdamond->damon_ctx; if (!ctx) return -EINVAL; return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx); + kdamond->contexts->contexts_arr[0]->schemes, ctx, + total_bytes_only); } static int damon_sysfs_upd_schemes_regions_stop( @@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; static bool damon_sysfs_schemes_regions_updating; + bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ @@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + total_bytes_only = true; + fallthrough; case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond); + err = damon_sysfs_upd_schemes_regions_start(kdamond, + total_bytes_only); if (!err) { damon_sysfs_schemes_regions_updating = true; goto keep_lock_out; -- cgit v1.2.3 From ab9bda001b681c293fb72ef21f083adfbcd78028 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:00 +0000 Subject: mm/damon/core: introduce address range type damos filter Patch series "Extend DAMOS filters for address ranges and DAMON monitoring targets" There are use cases that need to apply DAMOS schemes to specific address ranges or DAMON monitoring targets. NUMA nodes in the physical address space, special memory objects in the virtual address space, and monitoring target specific efficient monitoring results snapshot retrieval could be examples of such use cases. This patchset extends DAMOS filters feature for such cases, by implementing two more filter types, namely address ranges and DAMON monitoring types. Patches sequence ---------------- The first seven patches are for the address ranges based DAMOS filter. The first patch implements the filter feature and expose it via DAMON kernel API. The second patch further expose the feature to users via DAMON sysfs interface. 
The third and fourth patches implement unit tests and selftests for the feature. Three patches (fifth to seventh) updating the documents follow. The following six patches are for the DAMON monitoring target based DAMOS filter. The eighth patch implements the feature in the core layer and expose it via DAMON's kernel API. The ninth patch further expose it to users via DAMON sysfs interface. Tenth patch add a selftest, and two patches (eleventh and twelfth) update documents. [1] https://lore.kernel.org/damon/20230728203444.70703-1-sj@kernel.org/ This patch (of 13): Users can know special characteristic of specific address ranges. NUMA nodes or special objects or buffers in virtual address space could be such examples. For such cases, DAMOS schemes could required to be applied to only specific address ranges. Implement yet another type of DAMOS filter for the purpose. Note that the existing filter types, namely anon pages and memcg DAMOS filters needed page level type check. Because such check can be done efficiently in the opertions set layer, those filters are handled in operations set layer. Specifically, only paddr operations set implementation supports these filters. Also, because statistics counting is done in the DAMON core layer, the regions that filtered out by these filters are counted as tried but failed to the statistics. Unlike those, address range based filters can efficiently handled in the core layer. Hence, do the handling in the layer, and count the regions that filtered out by those as the scheme has not tried for the region. This difference should clearly documented. Link: https://lkml.kernel.org/r/20230802214312.110532-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802214312.110532-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 22 ++++++++++++++++------ mm/damon/core.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index d5d4d19928e0..476f37a883a4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -226,16 +226,24 @@ struct damos_stat { * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @DAMOS_FILTER_TYPE_ADDR: Address range. * @NR_DAMOS_FILTER_TYPES: Number of filter types. * - * The support of each filter type is up to running &struct damon_operations. - * &enum DAMON_OPS_PADDR is supporting all filter types, while - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any - * filter types. + * The anon pages type and memcg type filters are handled by underlying + * &struct damon_operations as a part of scheme action trying, and therefore + * accounted as 'tried'. In contrast, other types are handled by core layer + * before trying of the action and therefore not accounted as 'tried'. + * + * The support of the filters that handled by &struct damon_operations depend + * on the running &struct damon_operations. + * &enum DAMON_OPS_PADDR supports both anon pages type and memcg type filters, + * while &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR don't support any of + * the two types. 
*/ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, + DAMOS_FILTER_TYPE_ADDR, NR_DAMOS_FILTER_TYPES, }; @@ -244,18 +252,20 @@ enum damos_filter_type { * @type: Type of the page. * @matching: If the matching page should filtered out or in. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each * page of the region matches to this and avoid applying the action if so. - * Note that the check support is up to &struct damon_operations - * implementation. + * Support of each filter type depends on the running &struct damon_operations + * and the type. Refer to &enum damos_filter_type for more detai. */ struct damos_filter { enum damos_filter_type type; bool matching; union { unsigned short memcg_id; + struct damon_addr_range addr_range; }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 91cff7f2997e..68a5fb1c039d 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -877,6 +877,56 @@ static void damos_update_stat(struct damos *s, s->stat.sz_applied += sz_applied; } +static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, + struct damos_filter *filter) +{ + bool matched = false; + unsigned long start, end; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ADDR: + start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); + end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + + /* inside the range */ + if (start <= r->ar.start && r->ar.end <= end) { + matched = true; + break; + } + /* outside of the range */ + if (r->ar.end <= start || end <= r->ar.start) { + matched = false; + break; + } + /* start before the range and overlap */ + if (r->ar.start < start) { + damon_split_region_at(t, r, start - r->ar.start); + matched = false; + break; + } + /* start inside the range */ + damon_split_region_at(t, r, end - r->ar.start); + matched = true; + break; + default: + break; + } + + return matched == filter->matching; +} + +static bool damos_filter_out(struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos_filter *filter; + + damos_for_each_filter(filter, s) { + if (__damos_filter_out(t, r, filter)) + return true; + } + return false; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -894,6 +944,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, goto update_stat; damon_split_region_at(t, r, sz); } + if (damos_filter_out(t, r, s)) + return; ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) err = c->callback.before_damos_apply(c, t, r, s); -- cgit v1.2.3 From 2f1abcfccd86826777b2bcb2bb4e0d149a90ccf5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:01 +0000 Subject: mm/damon/sysfs-schemes: support address range type DAMOS filter Extend DAMON sysfs interface to support address range based DAMOS filters, by adding a special keyword for the filter//type file, namely 'addr', and two files under filter// for specifying the start and the end addresses of the range, namely 'addr_start' and 'addr_end'. 
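The following sketch shows one way such a filter could be programmed from user space. It assumes the filter directory already exists (i.e. the scheme's nr_filters file has been set), that kdamond/context/scheme/filter index 0 is the one of interest, and that the 1 GiB to 2 GiB range plus the 'Y' written to matching are purely illustrative values.

#include <stdio.h>

/* Write a single value to a sysfs file; returns 0 on success. */
static int write_file(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        if (fprintf(f, "%s\n", val) < 0) {
                fclose(f);
                return -1;
        }
        return fclose(f);
}

int main(void)
{
        const char *dir = "/sys/kernel/mm/damon/admin/kdamonds/0/"
                          "contexts/0/schemes/0/filters/0";
        char path[256];

        snprintf(path, sizeof(path), "%s/type", dir);
        if (write_file(path, "addr"))
                return 1;
        snprintf(path, sizeof(path), "%s/addr_start", dir);
        if (write_file(path, "1073741824"))     /* 1 GiB, example only */
                return 1;
        snprintf(path, sizeof(path), "%s/addr_end", dir);
        if (write_file(path, "2147483648"))     /* 2 GiB, example only */
                return 1;
        snprintf(path, sizeof(path), "%s/matching", dir);
        return write_file(path, "Y") ? 1 : 0;
}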
Link: https://lkml.kernel.org/r/20230802214312.110532-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9a015079f3a4..03ddba3e216d 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -282,6 +282,7 @@ struct damon_sysfs_scheme_filter { enum damos_filter_type type; bool matching; char *memcg_path; + struct damon_addr_range addr_range; }; static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) @@ -293,6 +294,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", + "addr", }; static ssize_t type_show(struct kobject *kobj, @@ -373,6 +375,44 @@ static ssize_t memcg_path_store(struct kobject *kobj, return count; } +static ssize_t addr_start_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->addr_range.start); +} + +static ssize_t addr_start_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->addr_range.start); + + return err ? err : count; +} + +static ssize_t addr_end_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->addr_range.end); +} + +static ssize_t addr_end_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->addr_range.end); + + return err ? 
err : count; +} + static void damon_sysfs_scheme_filter_release(struct kobject *kobj) { struct damon_sysfs_scheme_filter *filter = container_of(kobj, @@ -391,10 +431,18 @@ static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr = + __ATTR_RW_MODE(addr_start, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr = + __ATTR_RW_MODE(addr_end, 0600); + static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, + &damon_sysfs_scheme_filter_addr_start_attr.attr, + &damon_sysfs_scheme_filter_addr_end_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); @@ -1484,7 +1532,15 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, damos_destroy_filter(filter); return err; } + } else if (filter->type == DAMOS_FILTER_TYPE_ADDR) { + if (sysfs_filter->addr_range.end < + sysfs_filter->addr_range.start) { + damos_destroy_filter(filter); + return -EINVAL; + } + filter->addr_range = sysfs_filter->addr_range; } + damos_add_filter(scheme, filter); } return 0; -- cgit v1.2.3 From 26713c8908752a7edca18dcafe88e36dccfb41a2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:02 +0000 Subject: mm/damon/core-test: add a unit test for __damos_filter_out() Implement a kunit test for the core of address range DAMOS filter handling, namely __damos_filter_out(). The test especially focus on regions that overlap with given filter's target address range. Link: https://lkml.kernel.org/r/20230802214312.110532-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'mm') diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 4bddbfe243c3..6cc8b245586d 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -353,6 +353,66 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_filter_out(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2; + struct damos_filter *f; + + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f->addr_range = (struct damon_addr_range){ + .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; + + t = damon_new_target(); + r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5); + damon_add_region(r, t); + + /* region in the range */ + KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region before the range */ + r->ar.start = DAMON_MIN_REGION * 1; + r->ar.end = DAMON_MIN_REGION * 2; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region after the range */ + r->ar.start = DAMON_MIN_REGION * 6; + r->ar.end = DAMON_MIN_REGION * 8; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region started before the range */ + r->ar.start = DAMON_MIN_REGION * 1; + r->ar.end = DAMON_MIN_REGION * 4; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + /* filter should have split the region */ + 
KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); + KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); + r2 = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4); + damon_destroy_region(r2, t); + + /* region started in the range */ + r->ar.start = DAMON_MIN_REGION * 2; + r->ar.end = DAMON_MIN_REGION * 8; + KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + /* filter should have split the region */ + KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); + r2 = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8); + damon_destroy_region(r2, t); + + damon_free_target(t); + damos_free_filter(f); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -366,6 +426,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_filter_out), {}, }; -- cgit v1.2.3 From 17e7c724d3c2e622c4d9969b7a473e8ed1d14ff0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:07 +0000 Subject: mm/damon/core: implement target type damos filter One DAMON context can have multiple monitoring targets, and DAMOS schemes are applied to all targets. In some cases, users need to apply different scheme to different targets. Retrieving monitoring results via DAMON sysfs interface' 'tried_regions' directory could be one good example. Also, there could be cases that cgroup DAMOS filter is not enough. All such use cases can be worked around by having multiple DAMON contexts having only single target, but it is inefficient in terms of resource usage, thogh the overhead is not estimated to be huge. Implement DAMON monitoring target based DAMOS filter for the case. Like address range target DAMOS filter, handle these filters in the DAMON core layer, since it is more efficient than doing in operations set layer. This also means that regions that filtered out by monitoring target type DAMOS filters are counted as not tried by the scheme. Hence, target granularity monitoring results retrieval via DAMON sysfs interface becomes available. Link: https://lkml.kernel.org/r/20230802214312.110532-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index 476f37a883a4..ae2664d1d5f1 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. + * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. 
* * The anon pages type and memcg type filters are handled by underlying @@ -244,6 +245,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_ADDR, + DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, }; @@ -253,6 +255,9 @@ enum damos_filter_type { * @matching: If the matching page should filtered out or in. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. + * @target_idx: Index of the &struct damon_target of + * &damon_ctx->adaptive_targets if @type is + * DAMOS_FILTER_TYPE_TARGET. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -266,6 +271,7 @@ struct damos_filter { union { unsigned short memcg_id; struct damon_addr_range addr_range; + int target_idx; }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 68a5fb1c039d..c1f1483c5082 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -877,13 +877,23 @@ static void damos_update_stat(struct damos *s, s->stat.sz_applied += sz_applied; } -static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, - struct damos_filter *filter) +static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos_filter *filter) { bool matched = false; + struct damon_target *ti; + int target_idx = 0; unsigned long start, end; switch (filter->type) { + case DAMOS_FILTER_TYPE_TARGET: + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + target_idx++; + } + matched = target_idx == filter->target_idx; + break; case DAMOS_FILTER_TYPE_ADDR: start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); @@ -915,13 +925,13 @@ static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, return matched == filter->matching; } -static bool damos_filter_out(struct damon_target *t, struct damon_region *r, - struct damos *s) +static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s) { struct damos_filter *filter; damos_for_each_filter(filter, s) { - if (__damos_filter_out(t, r, filter)) + if (__damos_filter_out(ctx, t, r, filter)) return true; } return false; @@ -944,7 +954,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, goto update_stat; damon_split_region_at(t, r, sz); } - if (damos_filter_out(t, r, s)) + if (damos_filter_out(c, t, r, s)) return; ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) -- cgit v1.2.3 From 9f6e47abfcb40c2f97f6987fca086ff463de2381 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:08 +0000 Subject: mm/damon/sysfs-schemes: support target damos filter Extend DAMON sysfs interface to support the DAMON monitoring target based DAMOS filter. Users can use it via writing 'target' to the filter's 'type' file and specifying the index of the target from the corresponding DAMON context's monitoring targets list to 'target_idx' sysfs file. 
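Analogous to the address-range sketch earlier, selecting a monitoring target boils down to two sysfs writes; note that the index file is created as damon_target_idx in the diff that follows. The paths and the target index 1 are, as before, assumptions about the setup rather than part of the patch.

#include <stdio.h>

int main(void)
{
        const char *dir = "/sys/kernel/mm/damon/admin/kdamonds/0/"
                          "contexts/0/schemes/0/filters/0";
        char path[256];
        FILE *f;

        /* filter type: DAMON monitoring target */
        snprintf(path, sizeof(path), "%s/type", dir);
        f = fopen(path, "w");
        if (!f || fputs("target\n", f) == EOF || fclose(f))
                return 1;

        /* apply (or exclude, depending on 'matching') monitoring target 1 */
        snprintf(path, sizeof(path), "%s/damon_target_idx", dir);
        f = fopen(path, "w");
        if (!f || fputs("1\n", f) == EOF || fclose(f))
                return 1;
        return 0;
}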
Link: https://lkml.kernel.org/r/20230802214312.110532-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'mm') diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 03ddba3e216d..527e7d17eb3b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -283,6 +283,7 @@ struct damon_sysfs_scheme_filter { bool matching; char *memcg_path; struct damon_addr_range addr_range; + int target_idx; }; static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) @@ -295,6 +296,7 @@ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", "addr", + "target", }; static ssize_t type_show(struct kobject *kobj, @@ -413,6 +415,25 @@ static ssize_t addr_end_store(struct kobject *kobj, return err ? err : count; } +static ssize_t damon_target_idx_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%d\n", filter->target_idx); +} + +static ssize_t damon_target_idx_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoint(buf, 0, &filter->target_idx); + + return err ? err : count; +} + static void damon_sysfs_scheme_filter_release(struct kobject *kobj) { struct damon_sysfs_scheme_filter *filter = container_of(kobj, @@ -437,12 +458,16 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr = static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr = __ATTR_RW_MODE(addr_end, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = + __ATTR_RW_MODE(damon_target_idx, 0600); + static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, + &damon_sysfs_scheme_filter_damon_target_idx_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); @@ -1539,6 +1564,8 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return -EINVAL; } filter->addr_range = sysfs_filter->addr_range; + } else if (filter->type == DAMOS_FILTER_TYPE_TARGET) { + filter->target_idx = sysfs_filter->target_idx; } damos_add_filter(scheme, filter); -- cgit v1.2.3 From 73d4719363371afdcd63143c3532fa6a9443de13 Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Thu, 3 Aug 2023 19:38:23 +0800 Subject: mm/z3fold: use helper function put_z3fold_locked() and put_z3fold_locked_list() This code is already duplicated six times, use helper function put_z3fold_locked() to release z3fold page instead of open code it to help improve code readability a bit. And add put_z3fold_locked_list() helper function to be consistent with it. No functional change involved. 
Link: https://lkml.kernel.org/r/20230803113824.886413-1-ruanjinjie@huawei.com Signed-off-by: Ruan Jinjie Reviewed-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/z3fold.c b/mm/z3fold.c index e84de91ecccb..7952adf9bede 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -480,6 +480,16 @@ static void release_z3fold_page_locked_list(struct kref *ref) __release_z3fold_page(zhdr, true); } +static inline int put_z3fold_locked(struct z3fold_header *zhdr) +{ + return kref_put(&zhdr->refcount, release_z3fold_page_locked); +} + +static inline int put_z3fold_locked_list(struct z3fold_header *zhdr) +{ + return kref_put(&zhdr->refcount, release_z3fold_page_locked_list); +} + static void free_pages_work(struct work_struct *w) { struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); @@ -666,7 +676,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) return new_zhdr; out_fail: - if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) { + if (new_zhdr && !put_z3fold_locked(new_zhdr)) { add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); } @@ -741,7 +751,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + if (put_z3fold_locked(zhdr)) return; if (test_bit(PAGE_STALE, &page->private) || @@ -752,7 +762,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { - if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + if (!put_z3fold_locked(zhdr)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } @@ -878,7 +888,7 @@ lookup: return zhdr; out_fail: - if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + if (!put_z3fold_locked(zhdr)) { add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } @@ -1012,8 +1022,7 @@ retry: if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { - if (!kref_put(&zhdr->refcount, - release_z3fold_page_locked)) + if (!put_z3fold_locked(zhdr)) z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); @@ -1129,7 +1138,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) if (!page_claimed) free_handle(handle, zhdr); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) + if (put_z3fold_locked_list(zhdr)) return; if (page_claimed) { /* the page has not been claimed by us */ @@ -1346,7 +1355,7 @@ static void z3fold_page_putback(struct page *page) if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); INIT_LIST_HEAD(&page->lru); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + if (put_z3fold_locked(zhdr)) return; if (list_empty(&zhdr->buddy)) add_to_unbuddied(pool, zhdr); -- cgit v1.2.3 From c1dc69e6ce65d95e3d4d080868c3007f1a6fc4fe Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 3 Aug 2023 19:49:34 +0800 Subject: mm/page_alloc: remove unneeded variable base Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations"), local variable base is just as same as order. So remove it. No functional change intended. 
Link: https://lkml.kernel.org/r/20230803114934.693989-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b17dcbb925d..94f9e159a18d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -538,8 +538,6 @@ out: static inline unsigned int order_to_pindex(int migratetype, int order) { - int base = order; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); @@ -549,7 +547,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif - return (MIGRATE_PCPTYPES * base) + migratetype; + return (MIGRATE_PCPTYPES * order) + migratetype; } static inline int pindex_to_order(unsigned int pindex) -- cgit v1.2.3 From 3a1060c2615874bd4d66d72dfdabbc48496ef040 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 3 Aug 2023 20:00:21 +0800 Subject: mm/memcg: fix wrong function name above obj_cgroup_charge_zswap() The correct function name is obj_cgroup_may_zswap(). Correct the comment. Link: https://lkml.kernel.org/r/20230803120021.762279-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Johannes Weiner Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56abc4f426f4..da9f983a090e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7777,7 +7777,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) * @objcg: the object cgroup * @size: size of compressed object * - * This forces the charge after obj_cgroup_may_swap() allowed + * This forces the charge after obj_cgroup_may_zswap() allowed * compression and storage in zwap for this cgroup to go ahead. */ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) -- cgit v1.2.3 From 16951789008dc0029b1e073fb1c20c1abb4c6504 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:48:58 +0800 Subject: mm/compaction: set compact_cached_free_pfn correctly in update_pageblock_skip Patch series "Fixes and cleanups to compaction", v2. This series contains random fixes and cleanups to free page isolation in compaction. This is based on another compact series[1]. More details can be found in respective patches. This patch (of 4): We will set skip to page block of block_start_pfn, it's more reasonable to set compact_cached_free_pfn to page block before the block_start_pfn. Link: https://lkml.kernel.org/r/20230803094901.2915942-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230803094901.2915942-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index c4d3a3129fd5..6e0c7456026b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1717,7 +1717,8 @@ static void isolate_freepages(struct compact_control *cc) /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) - update_pageblock_skip(cc, page, block_start_pfn); + update_pageblock_skip(cc, page, block_start_pfn - + pageblock_nr_pages); /* Are enough freepages isolated? 
*/ if (cc->nr_freepages >= cc->nr_migratepages) { -- cgit v1.2.3 From a2864a67452ec6e378e57cbe151aad62ccdcc03f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:48:59 +0800 Subject: mm/compaction: merge end_pfn boundary check in isolate_freepages_range Merge the end_pfn boundary check for single page block forward and multiple page blocks forward to avoid do twice boundary check for multiple page blocks forward. Link: https://lkml.kernel.org/r/20230803094901.2915942-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 6e0c7456026b..d32929f39dc4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -740,8 +740,6 @@ isolate_freepages_range(struct compact_control *cc, /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; - block_end_pfn = min(block_end_pfn, end_pfn); - /* * pfn could pass the block_end_pfn if isolated freepage * is more than pageblock order. In this case, we adjust @@ -750,9 +748,10 @@ isolate_freepages_range(struct compact_control *cc, if (pfn >= block_end_pfn) { block_start_pfn = pageblock_start_pfn(pfn); block_end_pfn = pageblock_end_pfn(pfn); - block_end_pfn = min(block_end_pfn, end_pfn); } + block_end_pfn = min(block_end_pfn, end_pfn); + if (!pageblock_pfn_to_page(block_start_pfn, block_end_pfn, cc->zone)) break; -- cgit v1.2.3 From dc13292cccfd50916af00a471208fb48deb4d72f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:49:00 +0800 Subject: mm/compaction: remove unnecessary cursor page in isolate_freepages_block The cursor is only used for page forward currently. We can simply move page forward directly to remove unnecessary cursor. Link: https://lkml.kernel.org/r/20230803094901.2915942-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Kemeng Shi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d32929f39dc4..82fd543b410e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -589,7 +589,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *page; unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; @@ -599,12 +599,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (strict) stride = 1; - cursor = pfn_to_page(blockpfn); + page = pfn_to_page(blockpfn); /* Isolate free pages. 
*/ - for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { + for (; blockpfn < end_pfn; blockpfn += stride, page += stride) { int isolated; - struct page *page = cursor; /* * Periodically drop the lock (if held) regardless of its @@ -629,7 +628,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (likely(order <= MAX_ORDER)) { blockpfn += (1UL << order) - 1; - cursor += (1UL << order) - 1; + page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } goto isolate_fail; @@ -666,7 +665,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, } /* Advance to the end of split page */ blockpfn += isolated - 1; - cursor += isolated - 1; + page += isolated - 1; continue; isolate_fail: -- cgit v1.2.3 From 13cfd63f3fec403ca8966079972aac4565fcf379 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:49:01 +0800 Subject: mm/compaction: remove unnecessary "else continue" at end of loop in isolate_freepages_block There is no behavior change to remove "else continue" code at end of scan loop. Just remove it to make code cleaner. Link: https://lkml.kernel.org/r/20230803094901.2915942-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Kemeng Shi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 82fd543b410e..dc16efd5fac5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -671,8 +671,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, isolate_fail: if (strict) break; - else - continue; } -- cgit v1.2.3 From f720b471fdb35619402293dcd421761fb1942e27 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 1 Aug 2023 10:31:44 +0800 Subject: mm: hugetlb: use flush_hugetlb_tlb_range() in move_hugetlb_page_tables() Archs may need to do special things when flushing hugepage tlb, so use the more applicable flush_hugetlb_tlb_range() instead of flush_tlb_range(). Link: https://lkml.kernel.org/r/20230801023145.17026-2-wangkefeng.wang@huawei.com Fixes: 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Kefeng Wang Reviewed-by: Mike Kravetz Acked-by: Muchun Song Cc: Barry Song <21cnbao@gmail.com> Cc: Catalin Marinas Cc: Joel Fernandes (Google) Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Mina Almasry Cc: Will Deacon Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 26e87d6cc92f..102f83bd3a9f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5279,9 +5279,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, } if (shared_pmd) - flush_tlb_range(vma, range.start, range.end); + flush_hugetlb_tlb_range(vma, range.start, range.end); else - flush_tlb_range(vma, old_end - len, old_end); + flush_hugetlb_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); -- cgit v1.2.3 From dbdd2a989f2357d40f0c5a440ca81bf1390f11ba Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Aug 2023 08:43:37 +0200 Subject: mm: no need to export mm_kobj There are no modules using mm_kobj, so do not export it. 
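For context, mm_kobj is the kobject behind /sys/kernel/mm, and its remaining users are built-in code that attaches sysfs directories to it during boot rather than modules. A rough sketch of that in-tree usage pattern (hypothetical directory name, not part of this patch):

	/* Built-in init code; nothing modular needs the symbol exported. */
	struct kobject *example_kobj = kobject_create_and_add("example", mm_kobj);
	if (!example_kobj)
		return -ENOMEM;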
Link: https://lkml.kernel.org/r/2023080436-algebra-cabana-417d@gregkh Signed-off-by: Greg Kroah-Hartman Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/mm_init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 641c56fd08a2..a2fbaa8d917f 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -154,7 +154,6 @@ early_param("mminit_loglevel", set_mminit_loglevel); #endif /* CONFIG_DEBUG_MEMORY_INIT */ struct kobject *mm_kobj; -EXPORT_SYMBOL_GPL(mm_kobj); #ifdef CONFIG_SMP s32 vm_committed_as_batch = 32; -- cgit v1.2.3 From 7c0a84bd0dc214a710305fbc0f407b8e7c410762 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:48 +0800 Subject: mm/compaction: correct last_migrated_pfn update in compact_zone We record start pfn of last isolated page block with last_migrated_pfn. And then: 1. We check if we mark the page block skip for exclusive access in isolate_migratepages_block by test if next migrate pfn is still in last isolated page block. If so, we will set finish_pageblock to do the rescan. 2. We check if a full cc->order block is scanned by test if last scan range passes the cc->order block boundary. If so, we flush the pages were freed. We treat cc->migrate_pfn before isolate_migratepages as the start pfn of last isolated page range. However, we always align migrate_pfn to page block or move to another page block in fast_find_migrateblock or in linearly scan forward in isolate_migratepages before do page isolation in isolate_migratepages_block. Update last_migrated_pfn with pageblock_start_pfn(cc->migrate_pfn - 1) after scan to correctly set start pfn of last isolated page range. To avoid that: 1. Miss a rescan with finish_pageblock set as last_migrate_pfn does not point to right pageblock and the migrate will not be in pageblock of last_migrate_pfn as it should be. 2. Wrongly issue flush by test cc->order block boundary with wrong last_migrate_pfn. Link: https://lkml.kernel.org/r/20230804110454.2935878-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index dc16efd5fac5..bc2eda71179a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2507,7 +2507,8 @@ rescan: goto check_drain; case ISOLATE_SUCCESS: update_cached = false; - last_migrated_pfn = iteration_start_pfn; + last_migrated_pfn = max(cc->zone->zone_start_pfn, + pageblock_start_pfn(cc->migrate_pfn - 1)); } err = migrate_pages(&cc->migratepages, compaction_alloc, -- cgit v1.2.3 From 7545e2f20aebf4da413be00384c4245eda5beb4d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:49 +0800 Subject: mm/compaction: skip page block marked skip in isolate_migratepages_block Move migrate_pfn to page block end when block is marked skip to avoid unnecessary scan retry of that block from upper caller. For example, compact_zone may wrongly rescan skip page block with finish_pageblock set as following: 1. cc->migrate point to the start of page block 2. compact_zone record last_migrated_pfn to cc->migrate 3. compact_zone->isolate_migratepages->isolate_migratepages_block tries to scan the block. The low_pfn maybe moved forward to middle of block because of free pages at beginning of block. 4. we find first lru page could be isolated but block was exclusive marked skip. 5. 
abort isolate_migratepages_block and make cc->migrate_pfn point to found lru page at middle of block. 6. compact_zone find cc->migrate_pfn and last_migrated_pfn are in the same block and wrongly rescan the block with finish_pageblock set. Link: https://lkml.kernel.org/r/20230804110454.2935878-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index bc2eda71179a..78826c433ef6 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1140,6 +1140,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, skip_updated = true; if (test_and_set_skip(cc, valid_page) && !cc->finish_pageblock) { + low_pfn = end_pfn; goto isolate_abort; } } -- cgit v1.2.3 From 0aa8ea3c5d353d5f0aa1e607f8dc5f43bf6cdf05 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:50 +0800 Subject: mm/compaction: correct comment of fast_find_migrateblock in isolate_migratepages After 90ed667c03fe5 ("Revert "Revert "mm/compaction: fix set skip in fast_find_migrateblock"""), we remove skip set in fast_find_migrateblock. Correct comment that fast_find_block is used to avoid isolation_suitable check for pageblock returned from fast_find_migrateblock because fast_find_migrateblock will mark found pageblock skipped. Instead, comment that fast_find_block is used to avoid a redundant check of fast found pageblock which is already checked skip flag inside fast_find_migrateblock. Link: https://lkml.kernel.org/r/20230804110454.2935878-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 78826c433ef6..3b204fbaa470 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1987,9 +1987,9 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) block_start_pfn = cc->zone->zone_start_pfn; /* - * fast_find_migrateblock marks a pageblock skipped so to avoid - * the isolation_suitable check below, check whether the fast - * search was successful. + * fast_find_migrateblock() has already ensured the pageblock is not + * set with a skipped flag, so to avoid the isolation_suitable check + * below again, check whether the fast search was successful. */ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; -- cgit v1.2.3 From cf043a007e00ae7fe5a4aa5447068fcd13ce031b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:51 +0800 Subject: mm/compaction: correct comment of cached migrate pfn update Commit e380bebe47715 ("mm, compaction: keep migration source private to a single compaction instance") moved update of async and sync compact_cached_migrate_pfn from update_pageblock_skip to update_cached_migrate but left the comment behind. Move the relevant comment to correct this. 
Link: https://lkml.kernel.org/r/20230804110454.2935878-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 3b204fbaa470..db44319dc716 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -469,6 +469,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) pfn = pageblock_end_pfn(pfn); + /* Update where async and sync compaction should restart */ if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && @@ -490,7 +491,6 @@ static void update_pageblock_skip(struct compact_control *cc, set_pageblock_skip(page); - /* Update where async and sync compaction should restart */ if (pfn < zone->compact_cached_free_pfn) zone->compact_cached_free_pfn = pfn; } -- cgit v1.2.3 From c3750cc7725af8da06f2f36ddce7adc52a3a51d6 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:52 +0800 Subject: mm/compaction: correct comment to complete migration failure Commit cfccd2e63e7e0 ("mm, compaction: finish pageblocks on complete migration failure") convert cc->order aligned check to page block order aligned check. Correct comment relevant with it. Link: https://lkml.kernel.org/r/20230804110454.2935878-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index db44319dc716..d0d3fea64b40 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2532,7 +2532,7 @@ rescan: } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page - * within the current order-aligned block and + * within the pageblock_order-aligned block and * fast_find_migrateblock may be used then scan the * remainder of the pageblock. 
This will mark the * pageblock "skip" to avoid rescanning in the near -- cgit v1.2.3 From f82024cbfa3a410d947b588658949a8a391da8a7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:53 +0800 Subject: mm/compaction: remove unnecessary return for void function Remove the unnecessary return statement from a void function. Link: https://lkml.kernel.org/r/20230804110454.2935878-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d0d3fea64b40..91a9dfa41ef4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1444,8 +1444,6 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn) set_pageblock_skip(page); - - return; } /* Search orders in round-robin fashion */ @@ -2898,7 +2896,7 @@ int compaction_register_node(struct node *node) void compaction_unregister_node(struct node *node) { - return device_remove_file(&node->dev, &dev_attr_compact); + device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ -- cgit v1.2.3 From 18c59d58baa60a8bfaec58d29b6b94877664eed8 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:54 +0800 Subject: mm/compaction: only set skip flag if cc->no_set_skip_hint is false Keep the same logic as update_pageblock_skip(): only set the skip flag if no_set_skip_hint is false, which is more reasonable. Link: https://lkml.kernel.org/r/20230804110454.2935878-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Cc: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 91a9dfa41ef4..fe7b4e7c5d24 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1442,7 +1442,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ - if (start_pfn == end_pfn) + if (start_pfn == end_pfn && !cc->no_set_skip_hint) set_pageblock_skip(page); } -- cgit v1.2.3 From 0db31d63f27e5b8ca84b9fd5a3cff5b12ac88abf Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 2 Aug 2023 15:23:28 +0800 Subject: mm: disable kernelcore=mirror when no mirror memory On a system with kernelcore=mirror enabled but no mirrored memory reported by EFI, the kernel can OOM during startup, since all memory besides zone DMA ends up in the movable zone, which prevents the kernel from using it. Zone DMA/DMA32 initialization is independent of mirrored memory and their max pfn is set in zone_sizes_init(). Since the kernel can fall back to zone DMA/DMA32 if there is no memory in zone Normal, these zones are treated as mirrored memory no matter what their memory attributes are. To solve this problem, disable kernelcore=mirror when no real mirrored memory exists.
Link: https://lkml.kernel.org/r/20230802072328.2107981-1-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Suggested-by: Kefeng Wang Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Kefeng Wang Cc: Levi Yun Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/memblock.c | 5 +++++ mm/mm_init.c | 5 +++++ 3 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 5a03bc4782a2..a037b1b37f6d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1022,6 +1022,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } extern bool mirrored_kernelcore; +extern bool memblock_has_mirror(void); static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { diff --git a/mm/memblock.c b/mm/memblock.c index f9e61e565a53..913b2520a9a0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -161,6 +161,11 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock; static int memblock_reserved_in_slab __initdata_memblock; +bool __init_memblock memblock_has_mirror(void) +{ + return system_has_some_mirror; +} + static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; diff --git a/mm/mm_init.c b/mm/mm_init.c index a2fbaa8d917f..2a19f3151661 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -376,6 +376,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + if (!memblock_has_mirror()) { + pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n"); + goto out; + } + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; -- cgit v1.2.3 From 61f297380118060a70888e0c1f5c534b74ab78fe Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:53 +0800 Subject: mm: remove redundant K() macro definition Patch series "cleanup with helper macro K()". Use helper macro K() to improve code readability. No functional modification involved. Remove redundant K() macro definition. This patch (of 7): Since commit eb8589b4f8c1 ("mm: move mem_init_print_info() to mm_init.c"), the K() macro definition has been moved to mm/internal.h. Therefore, the definitions in mm/memcontrol.c, mm/backing-dev.c and mm/oom_kill.c are redundant. Drop redundant definitions. 
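For reference, the shared helper these files now pick up from mm/internal.h is the same one being deleted from the local copies in the hunks below; it simply converts a page count to kilobytes:

	/* A page is (1 << PAGE_SHIFT) bytes, so shifting by (PAGE_SHIFT - 10) yields KiB. */
	#define K(x) ((x) << (PAGE_SHIFT - 10))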
[akpm@linux-foundation.org: oom_kill.c: remove "#undef K", per Kefeng] Link: https://lkml.kernel.org/r/20230804012559.2617515-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230804012559.2617515-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/backing-dev.c | 3 +-- mm/memcontrol.c | 1 - mm/oom_kill.c | 3 --- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3ffc3cfa7a14..fc44bfbf785e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -16,6 +16,7 @@ #include #include #include +#include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); @@ -34,8 +35,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -#define K(x) ((x) << (PAGE_SHIFT - 10)) - #ifdef CONFIG_DEBUG_FS #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da9f983a090e..d83f99580900 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1629,7 +1629,6 @@ static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) WARN_ON_ONCE(seq_buf_has_overflowed(s)); } -#define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_context: Print OOM information relevant to * memory controller. diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 612b5597d3af..44bde56ecd02 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -479,8 +479,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); static bool oom_killer_disabled __read_mostly; -#define K(x) ((x) << (PAGE_SHIFT-10)) - /* * task->mm can be NULL if the task is the exited group leader. So to * determine whether the task is using a particular mm, we examine all the @@ -994,7 +992,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) mmdrop(mm); put_task_struct(victim); } -#undef K /* * Kill provided task unless it's secured by setting -- cgit v1.2.3 From 00cde0429bc50792ed8786e909e70141acf3741a Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:54 +0800 Subject: mm/swapfile.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swapfile.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index e04eb9c0482d..b52145c6bac2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include "internal.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, @@ -2635,8 +2636,8 @@ static int swap_show(struct seq_file *swap, void *v) return 0; } - bytes = si->pages << (PAGE_SHIFT - 10); - inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); + bytes = K(si->pages); + inuse = K(READ_ONCE(si->inuse_pages)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -2861,8 +2862,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", - maxpages << (PAGE_SHIFT - 10), - last_page << (PAGE_SHIFT - 10)); + K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; @@ -3184,8 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, cluster_info); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - p->pages<<(PAGE_SHIFT-10), name->name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + K(p->pages), name->name, p->prio, nr_extents, + K((unsigned long long)span), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", -- cgit v1.2.3 From 3cb8eaa4558e4d1cde9641e1cb0dcbbd74ae5723 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:55 +0800 Subject: mm/swap_state.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swap_state.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/swap_state.c b/mm/swap_state.c index f8ea7015bad4..d157862ba0a6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,9 +63,8 @@ static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); - printk("Free swap = %ldkB\n", - get_nr_swap_pages() << (PAGE_SHIFT - 10)); - printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); + printk("Total swap = %lukB\n", K(total_swap_pages)); } void *get_shadow_from_swap_cache(swp_entry_t entry) -- cgit v1.2.3 From b91742d84d29c39b643992b95560cfb7337eab18 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:56 +0800 Subject: mm/shmem.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-5-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 235f2b2fd202..20daa207d8bf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3864,8 +3864,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) - seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_SHIFT - 10)); + seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) -- cgit v1.2.3 From d5a6474d3d36623b368c19dd3e9f5a09c6013120 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:57 +0800 Subject: mm/nommu.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-6-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 1fe0ee239860..8dba41cfc44d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1800,7 +1800,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1821,7 +1821,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; -- cgit v1.2.3 From b1773e0ea30a46cf23238ff38d5ca4756c41ad2e Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:58 +0800 Subject: mm/mmap.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-7-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bc91d91261ab..35b6bc9c7c95 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3785,7 +3785,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3806,7 +3806,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3850,7 +3850,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); -- cgit v1.2.3 From 6c1aa2d37f7677609c74a4ff120f99a07b90ba08 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:59 +0800 Subject: mm/hugetlb.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-8-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 102f83bd3a9f..851457af0869 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4748,7 +4748,7 @@ void hugetlb_show_meminfo_node(int nid) void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", - atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); + K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ -- cgit v1.2.3 From e727bfd5e73a35ecbc4a01a15c659b9fafaa97c0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:21 -0700 Subject: mm: replace mmap with vma write lock assertions when operating on a vma Vma write lock assertion always includes mmap write lock assertion and additional vma lock checks when per-VMA locks are enabled. Replace weaker mmap_assert_write_locked() assertions with stronger vma_assert_write_locked() ones when we are operating on a vma which is expected to be locked. Link: https://lkml.kernel.org/r/20230804152724.3090321-4-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Linus Torvalds Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 851457af0869..abfdcaf114f1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5029,7 +5029,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); - mmap_assert_write_locked(src); + vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* diff --git a/mm/memory.c b/mm/memory.c index 1113ee625a94..039dcbbcc7d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ - mmap_assert_write_locked(src_mm); + vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } -- cgit v1.2.3 From 60081bf19b0ec8fa40c589bd361fa2bc763f1050 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:22 -0700 Subject: mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is not obvious and makes it hard to understand where vma locking is happening. Also in some cases (like in dup_userfaultfd()) vma should be locked earlier than vma_flags modification. To make locking more visible, change these functions to assert that the vma write lock is taken and explicitly lock the vma beforehand. Fix userfaultfd functions which should lock the vma earlier. Link: https://lkml.kernel.org/r/20230804152724.3090321-5-surenb@google.com Suggested-by: Linus Torvalds Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + fs/userfaultfd.c | 6 ++++++ include/linux/mm.h | 10 +++++++--- mm/madvise.c | 5 ++--- mm/mlock.c | 3 ++- mm/mprotect.c | 1 + 6 files changed, 19 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 709ebd578394..e2d6f9327f77 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + vma_start_write(vma); /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9854d44ae18e..70bd2951b68d 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1511,6 +1515,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1694,6 +1699,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; diff --git a/include/linux/mm.h b/include/linux/mm.h index 49eafc62b4e6..5a6ff9140090 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -774,18 +774,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) = flags; } -/* Use when VMA is part of the VMA tree and modifications need coordination */ +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. 
+ */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } diff --git a/mm/madvise.c b/mm/madvise.c index da65f8bd9ac3..8498f700c284 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } success: - /* - * vm_flags is protected by the mmap_lock held in write mode. - */ + /* vm_flags is protected by the mmap_lock held in write mode. */ + vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index 0a0c996c5c21..1746600a2e14 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -386,6 +386,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; + vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); @@ -460,9 +461,9 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ + vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); diff --git a/mm/mprotect.c b/mm/mprotect.c index 3f36c88a238e..7cd7f644da80 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -656,6 +656,7 @@ success: * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ + vma_start_write(vma); vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; -- cgit v1.2.3 From ad9f006351c3368171458ae7ab14d72f031b239f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:23 -0700 Subject: mm: always lock new vma before inserting into vma tree While it's not strictly necessary to lock a newly created vma before adding it into the vma tree (as long as no further changes are performed to it), it seems like a good policy to lock it and prevent accidental changes after it becomes visible to the page faults. Lock the vma before adding it into the vma tree. [akpm@linux-foundation.org: fix reject fixing in vma_link(), per Jann] Link: https://lkml.kernel.org/r/20230804152724.3090321-6-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Linus Torvalds Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 35b6bc9c7c95..ef584aca1cd3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -401,6 +401,8 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; + vma_start_write(vma); + vma_iter_store(&vmi, vma); if (vma->vm_file) { @@ -463,7 +465,8 @@ static inline void vma_prepare(struct vma_prepare *vp) vma_start_write(vp->vma); if (vp->adj_next) vma_start_write(vp->adj_next); - /* vp->insert is always a newly created VMA, no need for locking */ + if (vp->insert) + vma_start_write(vp->insert); if (vp->remove) vma_start_write(vp->remove); if (vp->remove2) @@ -3093,6 +3096,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_pgoff = addr >> PAGE_SHIFT; vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -3341,7 +3345,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_start_write(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; -- cgit v1.2.3 From c9d6e982c3f8703c24f488d3de15e0ee97f4655e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:24 -0700 Subject: mm: move vma locking out of vma_prepare and dup_anon_vma vma_prepare() is currently the central place where vmas are being locked before vma_complete() applies changes to them. While this is convenient, it also obscures vma locking and makes it harder to follow the locking rules. Move vma locking out of vma_prepare() and take vma locks explicitly at the locations where vmas are being modified. Move vma locking and replace it with an assertion inside dup_anon_vma() to further clarify the locking pattern inside vma_merge(). Link: https://lkml.kernel.org/r/20230804152724.3090321-7-surenb@google.com Suggested-by: Linus Torvalds Suggested-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mmap.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index ef584aca1cd3..514ced13c65c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -462,16 +462,6 @@ static inline void init_vma_prep(struct vma_prepare *vp, */ static inline void vma_prepare(struct vma_prepare *vp) { - vma_start_write(vp->vma); - if (vp->adj_next) - vma_start_write(vp->adj_next); - if (vp->insert) - vma_start_write(vp->insert); - if (vp->remove) - vma_start_write(vp->remove); - if (vp->remove2) - vma_start_write(vp->remove2); - if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); @@ -605,7 +595,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { - vma_start_write(dst); + vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } @@ -637,10 +627,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, bool remove_next = false; struct vma_prepare vp; + vma_start_write(vma); if (next && (vma != next) && (end == next->vm_end)) { int ret; remove_next = true; + vma_start_write(next); ret = dup_anon_vma(vma, next); if (ret) return ret; @@ -696,6 +688,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; + vma_start_write(vma); + init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); @@ -921,16 +915,21 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!merge_prev && !merge_next) return NULL; /* Not mergeable. */ + if (merge_prev) + vma_start_write(prev); + res = vma = prev; remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? */ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + vma_start_write(next); remove = next; /* case 1 */ vma_end = next->vm_end; err = dup_anon_vma(prev, next); if (curr) { /* case 6 */ + vma_start_write(curr); remove = curr; remove2 = next; if (!next->anon_vma) @@ -938,6 +937,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } else if (merge_prev) { /* case 2 */ if (curr) { + vma_start_write(curr); err = dup_anon_vma(prev, curr); if (end == curr->vm_end) { /* case 7 */ remove = curr; @@ -947,8 +947,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } } else { /* merge_next */ + vma_start_write(next); res = next; if (prev && addr < prev->vm_end) { /* case 4 */ + vma_start_write(prev); vma_end = addr; adjust = next; adj_start = -(prev->vm_end - addr); @@ -964,6 +966,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_pgoff = next->vm_pgoff - pglen; if (curr) { /* case 8 */ vma_pgoff = curr->vm_pgoff; + vma_start_write(curr); remove = curr; err = dup_anon_vma(next, curr); } @@ -2366,6 +2369,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); + vma_start_write(vma); + vma_start_write(new); + init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); @@ -3071,6 +3077,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi, vma)) goto unacct_fail; + vma_start_write(vma); + init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); -- cgit v1.2.3 From 9a9d0b829901125553c36b9512b2a5da4505be31 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 7 Aug 2023 01:16:11 +0200 Subject: mm: move dummy_vm_ops out of a header Otherwise the kernel ends up with multiple copies: $ nm vmlinux | grep dummy_vm_ops ffffffff81e4ea00 d dummy_vm_ops.2 ffffffff81e11760 d dummy_vm_ops.254 ffffffff81e406e0 d dummy_vm_ops.4 ffffffff81e3c780 d dummy_vm_ops.7 While here prefix it with vma_. 
Link: https://lkml.kernel.org/r/20230806231611.1395735-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/init-mm.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a6ff9140090..c63ec57a54dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -751,17 +751,17 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #endif /* CONFIG_PER_VMA_LOCK */ +extern const struct vm_operations_struct vma_dummy_vm_ops; + /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { - static const struct vm_operations_struct dummy_vm_ops = {}; - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; - vma->vm_ops = &dummy_vm_ops; + vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); diff --git a/mm/init-mm.c b/mm/init-mm.c index efa97b57acfd..cfd367822cdd 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -17,6 +17,8 @@ #define INIT_MM_CONTEXT(name) #endif +const struct vm_operations_struct vma_dummy_vm_ops; + /* * For dynamically allocated mm_structs, there is a dynamically sized cpumask * at the end of the structure, the size of which depends on the maximum CPU -- cgit v1.2.3 From 6379693e3c2683a7c86f395e878534731ac7ed06 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 7 Aug 2023 19:41:25 +0800 Subject: mm: memory-failure: use helper macro llist_for_each_entry_safe() It's more convenient to use helper macro llist_for_each_entry_safe(). No functional change intended. Link: https://lkml.kernel.org/r/20230807114125.3440802-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index adb0dacbc74e..976747d28ce7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1868,13 +1868,12 @@ bool is_raw_hwpoison_page_in_hugepage(struct page *page) static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { - struct llist_node *t, *tnode, *head; + struct llist_node *head; + struct raw_hwp_page *p, *next; unsigned long count = 0; head = llist_del_all(raw_hwp_list_head(folio)); - llist_for_each_safe(tnode, t, head) { - struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); - + llist_for_each_entry_safe(p, next, head, node) { if (move_flag) SetPageHWPoison(p->page); else @@ -1889,7 +1888,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; - struct llist_node *t, *tnode; + struct raw_hwp_page *p, *next; int ret = folio_test_set_hwpoison(folio) ? 
-EHWPOISON : 0; /* @@ -1900,9 +1899,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; head = raw_hwp_list_head(folio); - llist_for_each_safe(tnode, t, head->first) { - struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); - + llist_for_each_entry_safe(p, next, head->first, node) { if (p->page == page) return -EHWPOISON; } -- cgit v1.2.3 From daee07bfba3340b07edcf9ae92044398e8a964db Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 7 Aug 2023 10:35:28 +0800 Subject: mm/mm_init: use helper macro BITS_PER_LONG and BITS_PER_BYTE It's more readable to use helper macro BITS_PER_LONG and BITS_PER_BYTE. No functional change intended. Link: https://lkml.kernel.org/r/20230807023528.325191-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mm_init.c b/mm/mm_init.c index 2a19f3151661..50f2f34745af 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -79,7 +79,7 @@ void __init mminit_verify_pageflags_layout(void) int shift, width; unsigned long or_mask, add_mask; - shift = 8 * sizeof(unsigned long); + shift = BITS_PER_LONG; width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", @@ -1426,9 +1426,9 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; - usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + usemapsize = roundup(usemapsize, BITS_PER_LONG); - return usemapsize / 8; + return usemapsize / BITS_PER_BYTE; } static void __ref setup_usemap(struct zone *zone) -- cgit v1.2.3 From 3f32c49ed6f15c8412a8abc93a92c4b37e6c4592 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 8 Aug 2023 11:33:59 +0800 Subject: mm: memtest: convert to memtest_report_meminfo() It is better to not expose too many internal variables of memtest, add a helper memtest_report_meminfo() to show memtest results. 
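The user-visible output is unchanged by the refactor: with CONFIG_MEMTEST enabled and early memtest having run, /proc/meminfo still carries a line of the form below (value is illustrative; 0 kB means the test ran and found no bad memory):

	EarlyMemtestBad:       0 kB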
Link: https://lkml.kernel.org/r/20230808033359.174986-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport (IBM) Cc: Matthew Wilcox Cc: Tomas Mudrunka Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 12 +----------- include/linux/memblock.h | 10 ++++------ mm/memtest.c | 22 ++++++++++++++++++++-- 3 files changed, 25 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74e3c3815696..45af9a989d40 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -133,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); -#ifdef CONFIG_MEMTEST - if (early_memtest_done) { - unsigned long early_memtest_bad_size_kb; - - early_memtest_bad_size_kb = early_memtest_bad_size>>10; - if (early_memtest_bad_size && !early_memtest_bad_size_kb) - early_memtest_bad_size_kb = 1; - /* When 0 is reported, it means there actually was a successful test */ - seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); - } -#endif + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 0d031fbfea25..1c1072e3ca06 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -594,13 +594,11 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif #ifdef CONFIG_MEMTEST -extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */ -extern bool early_memtest_done; /* Was early memtest done? */ -extern void early_memtest(phys_addr_t start, phys_addr_t end); +void early_memtest(phys_addr_t start, phys_addr_t end); +void memtest_report_meminfo(struct seq_file *m); #else -static inline void early_memtest(phys_addr_t start, phys_addr_t end) -{ -} +static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } +static inline void memtest_report_meminfo(struct seq_file *m) { } #endif diff --git a/mm/memtest.c b/mm/memtest.c index 57149dfee438..32f3e9dda837 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -3,9 +3,10 @@ #include #include #include +#include -bool early_memtest_done; -phys_addr_t early_memtest_bad_size; +static bool early_memtest_done; +static phys_addr_t early_memtest_bad_size; static u64 patterns[] __initdata = { /* The first entry has to be 0 to leave memtest with zeroed memory */ @@ -117,3 +118,20 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end) do_one_pass(patterns[idx], start, end); } } + +void memtest_report_meminfo(struct seq_file *m) +{ + unsigned long early_memtest_bad_size_kb; + + if (!IS_ENABLED(CONFIG_PROC_FS)) + return; + + if (!early_memtest_done) + return; + + early_memtest_bad_size_kb = early_memtest_bad_size >> 10; + if (early_memtest_bad_size && !early_memtest_bad_size_kb) + early_memtest_bad_size_kb = 1; + /* When 0 is reported, it means there actually was a successful test */ + seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); +} -- cgit v1.2.3 From 97157d8908bc10dec22cc4479f4c5cc7db58a12c Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 8 Aug 2023 06:20:56 +0000 Subject: mm: zswap: update comment for struct zswap_entry Since commit 0bb488498c98 ("mm: zswap: remove zswap_header"), the 'offset' has been replaced by swpentry, update the comment for it, and also add comment for 'objcg'. 
Link: https://lkml.kernel.org/r/20230808062056.292950-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Reviewed-by: Yosry Ahmed Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 8b6b1bc8a5f2..7300b98d4a03 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -182,7 +182,7 @@ struct zswap_pool { * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type - * offset - the swap offset for the entry. Index into the red-black tree. + * swpentry - associated swap entry, the offset indexes into the red-black tree * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock @@ -195,6 +195,7 @@ struct zswap_pool { * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content + * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { -- cgit v1.2.3 From 9af7c7426c2e49bad77cf7494fea85a773d1ded6 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Tue, 8 Aug 2023 16:44:32 +0800 Subject: writeback: remove redundant checks for root memcg The check for root memcg will be done in wb_get_lookup(), so remove the redundant one to simplify the code. Link: https://lkml.kernel.org/r/20230808084431.1632934-1-alexjlzheng@tencent.com Signed-off-by: Jinliang Zheng Signed-off-by: Andrew Morton --- mm/backing-dev.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index fc44bfbf785e..1e3447bccdb1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -732,9 +732,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, might_alloc(gfp); - if (!memcg_css->parent) - return &bdi->wb; - do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); -- cgit v1.2.3 From 04d5ea46a15149a12f79c686b6a1ffc9c3233272 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:56 +0530 Subject: mm/memory_hotplug: simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig Patch series "Add support for memmap on memory feature on ppc64", v8. This patch series update memmap on memory feature to fall back to memmap allocation outside the memory block if the alignment rules are not met. This makes the feature more useful on architectures like ppc64 where alignment rules are different with 64K page size. This patch (of 6): Instead of adding menu entry with all supported architectures, add mm/Kconfig variable and select the same from supported architectures. No functional change in this patch. 
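With the symbol defined once in mm/Kconfig, a new architecture opts in with a single select in its arch Kconfig, mirroring the arm64/x86 hunks below; roughly (an illustrative sketch of how ppc64 is expected to wire it up later in the series, with any guard condition left to the architecture):

	config PPC
		select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE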
Link: https://lkml.kernel.org/r/20230808091501.287660-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230808091501.287660-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 751d8c8821db..a7a88322bf46 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -78,6 +78,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY select ARCH_USE_MEMTEST @@ -349,9 +350,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - config SMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78224aa76409..d0258e92a8af 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/mm/Kconfig b/mm/Kconfig index 5fe49c030961..721dc88423c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY endif # MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + bool + # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. -- cgit v1.2.3 From e3c2bfdd33a30b34674fb8839f5476ab2702c1c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:57 +0530 Subject: mm/memory_hotplug: allow memmap on memory hotplug request to fallback If not supported, fall back to not using memmap on memory. This avoids the need for callers to do the fallback.
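Callers can now pass the flag purely as a hint and let the core silently fall back, which is what the ACPI hunk below reduces to:

	mhp_flags |= MHP_MEMMAP_ON_MEMORY;	/* hint only: ignored if unsupported */
	result = __add_memory(mgid, info->start_addr, info->length, mhp_flags);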
Link: https://lkml.kernel.org/r/20230808091501.287660-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/acpi/acpi_memhotplug.c | 3 +-- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 13 ++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 24f662d8bd39..d0c1a71007d0 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (!info->length) continue; - if (mhp_supports_memmap_on_memory(info->length)) - mhp_flags |= MHP_MEMMAP_ON_MEMORY; + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(mgid, info->start_addr, info->length, mhp_flags); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 013c69753c91..7d2076583494 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -97,6 +97,8 @@ typedef int __bitwise mhp_t; * To do so, we will use the beginning of the hot-added range to build * the page tables for the memmap array that describes the entire range. * Only selected architectures support it with SPARSE_VMEMMAP. + * This is only a hint, the core kernel can decide to not do this based on + * different alignment checks. */ #define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) /* @@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7cfd13c91568..eca32ccd45cc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -bool mhp_supports_memmap_on_memory(unsigned long size) +static bool mhp_supports_memmap_on_memory(unsigned long size) { unsigned long nr_vmemmap_pages = size / PAGE_SIZE; unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); @@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { - if (!mhp_supports_memmap_on_memory(size)) { - ret = -EINVAL; - goto error; + if (mhp_supports_memmap_on_memory(size)) { + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; } - mhp_altmap.free = PHYS_PFN(size); - mhp_altmap.base_pfn = PHYS_PFN(start); - params.altmap = &mhp_altmap; + /* fallback to not using altmap */ } /* call arch's memory hotadd */ -- cgit v1.2.3 From 85a2b4b08f202d67be81e2453064e01572ec19c8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:58 +0530 Subject: mm/memory_hotplug: allow architecture to override memmap on memory support check Some architectures would want different restrictions. Hence add an architecture-specific override. The PMD_SIZE check is moved there. 
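The override is wired up with the usual #ifndef pattern, visible in the hunk below: the generic PMD_SIZE-based default is only compiled when the architecture has not already provided a function and a #define of the same name in its headers. A purely hypothetical arch-side override, with the relaxed page-size rule chosen for illustration rather than taken from the real ppc64 patch later in the series:

	/* hypothetical arch header snippet, not the actual ppc64 implementation */
	static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
	{
		/* example policy: only require the vmemmap to be page aligned */
		return IS_ALIGNED(vmemmap_size, PAGE_SIZE);
	}
	#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
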
Link: https://lkml.kernel.org/r/20230808091501.287660-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index eca32ccd45cc..746cb7c08c64 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,10 +1247,26 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +static inline unsigned long memory_block_memmap_size(void) +{ + return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); +} + +#ifndef arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) +{ + /* + * As default, we want the vmemmap to span a complete PMD such that we + * can map the vmemmap using a single PMD if supported by the + * architecture. + */ + return IS_ALIGNED(vmemmap_size, PMD_SIZE); +} +#endif + static bool mhp_supports_memmap_on_memory(unsigned long size) { - unsigned long nr_vmemmap_pages = size / PAGE_SIZE; - unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long remaining_size = size - vmemmap_size; /* @@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) */ return mhp_memmap_on_memory() && size == memory_block_size_bytes() && - IS_ALIGNED(vmemmap_size, PMD_SIZE) && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && + arch_supports_memmap_on_memory(vmemmap_size); } /* -- cgit v1.2.3 From 2d1f649c7c0855751c7ff43f4e34784061bc72f7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:59 +0530 Subject: mm/memory_hotplug: support memmap_on_memory when memmap is not aligned to pageblocks Currently, memmap_on_memory feature is only supported with memory block sizes that result in vmemmap pages covering full page blocks. This is because memory onlining/offlining code requires applicable ranges to be pageblock-aligned, for example, to set the migratetypes properly. This patch helps to lift that restriction by reserving more pages than required for vmemmap space. This helps the start address to be page block aligned with different memory block sizes. Using this facility implies the kernel will be reserving some pages for every memoryblock. This allows the memmap on memory feature to be widely useful with different memory block size values. For ex: with 64K page size and 256MiB memory block size, we require 4 pages to map vmemmap pages, To align things correctly we end up adding a reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved. 
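The 64K example works out as follows. The standalone calculation below reuses the changelog's 64KiB page size and 256MiB block size and assumes a 64-byte struct page and a 32-page (2MiB) pageblock, which reproduces the quoted 4 + 28 figure; it only illustrates the arithmetic and does not query a live system:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 64UL << 10;		/* 64KiB pages (from the changelog) */
	unsigned long block_size = 256UL << 20;		/* 256MiB memory block (from the changelog) */
	unsigned long struct_page_size = 64;		/* assumed sizeof(struct page) */
	unsigned long pageblock_nr_pages = 32;		/* assumed 2MiB pageblock with 64KiB pages */

	unsigned long nr_pages = block_size / page_size;			/* 4096 */
	unsigned long memmap_pages =
		(nr_pages * struct_page_size + page_size - 1) / page_size;	/* 4 */
	/* what memory_block_memmap_on_memory_pages() does in "force" mode */
	unsigned long altmap_pages =
		((memmap_pages + pageblock_nr_pages - 1) / pageblock_nr_pages) *
		pageblock_nr_pages;						/* 32 */

	printf("vmemmap: %lu pages, reserved: %lu, wasted: %lu per %lu-page block\n",
	       memmap_pages, altmap_pages, altmap_pages - memmap_pages, nr_pages);
	return 0;
}
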
Link: https://lkml.kernel.org/r/20230808091501.287660-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 12 +++ mm/memory_hotplug.c | 120 ++++++++++++++++++++---- 2 files changed, 113 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index bd77841041af..2994958c7ce8 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -433,6 +433,18 @@ The following module parameters are currently defined: memory in a way that huge pages in bigger granularity cannot be formed on hotplugged memory. + + With value "force" it could result in memory + wastage due to memmap size limitations. For + example, if the memmap for a memory block + requires 1 MiB, but the pageblock size is 2 + MiB, 1 MiB of hotplugged memory will be wasted. + Note that there are still cases where the + feature cannot be enforced: for example, if the + memmap is smaller than a single page, or if the + architecture does not support the forced mode + in all configurations. + ``online_policy`` read-write: Set the basic policy used for automatic zone selection when onlining memory blocks without specifying a target zone. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 746cb7c08c64..76b813991bdc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -41,17 +41,83 @@ #include "internal.h" #include "shuffle.h" +enum { + MEMMAP_ON_MEMORY_DISABLE = 0, + MEMMAP_ON_MEMORY_ENABLE, + MEMMAP_ON_MEMORY_FORCE, +}; + +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; + +static inline unsigned long memory_block_memmap_size(void) +{ + return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); +} + +static inline unsigned long memory_block_memmap_on_memory_pages(void) +{ + unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); + + /* + * In "forced" memmap_on_memory mode, we add extra pages to align the + * vmemmap size to cover full pageblocks. That way, we can add memory + * even if the vmemmap size is not properly aligned, however, we might waste + * memory. 
+ */ + if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) + return pageblock_align(nr_pages); + return nr_pages; +} + #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY /* * memory_hotplug.memmap_on_memory parameter */ -static bool memmap_on_memory __ro_after_init; -module_param(memmap_on_memory, bool, 0444); -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +static int set_memmap_mode(const char *val, const struct kernel_param *kp) +{ + int ret, mode; + bool enabled; + + if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { + mode = MEMMAP_ON_MEMORY_FORCE; + } else { + ret = kstrtobool(val, &enabled); + if (ret < 0) + return ret; + if (enabled) + mode = MEMMAP_ON_MEMORY_ENABLE; + else + mode = MEMMAP_ON_MEMORY_DISABLE; + } + *((int *)kp->arg) = mode; + if (mode == MEMMAP_ON_MEMORY_FORCE) { + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); + + pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", + memmap_pages - PFN_UP(memory_block_memmap_size())); + } + return 0; +} + +static int get_memmap_mode(char *buffer, const struct kernel_param *kp) +{ + if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE) + return sprintf(buffer, "force\n"); + return param_get_bool(buffer, kp); +} + +static const struct kernel_param_ops memmap_mode_ops = { + .set = set_memmap_mode, + .get = get_memmap_mode, +}; +module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" + "With value \"force\" it could result in memory wastage due " + "to memmap size limitations (Y/N/force)"); static inline bool mhp_memmap_on_memory(void) { - return memmap_on_memory; + return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; } #else static inline bool mhp_memmap_on_memory(void) @@ -1247,11 +1313,6 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -static inline unsigned long memory_block_memmap_size(void) -{ - return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); -} - #ifndef arch_supports_memmap_on_memory static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) { @@ -1267,7 +1328,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) static bool mhp_supports_memmap_on_memory(unsigned long size) { unsigned long vmemmap_size = memory_block_memmap_size(); - unsigned long remaining_size = size - vmemmap_size; + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); /* * Besides having arch support and the feature enabled at runtime, we @@ -1295,10 +1356,28 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. */ - return mhp_memmap_on_memory() && - size == memory_block_size_bytes() && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && - arch_supports_memmap_on_memory(vmemmap_size); + if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + return false; + + /* + * Make sure the vmemmap allocation is fully contained + * so that we always allocate vmemmap memory from altmap area. + */ + if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) + return false; + + /* + * start pfn should be pageblock_nr_pages aligned for correctly + * setting migrate types + */ + if (!pageblock_aligned(memmap_pages)) + return false; + + if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) + /* No effective hotplugged memory doesn't make sense. 
*/ + return false; + + return arch_supports_memmap_on_memory(vmemmap_size); } /* @@ -1311,7 +1390,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; - struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap mhp_altmap = { + .base_pfn = PHYS_PFN(res->start), + .end_pfn = PHYS_PFN(res->end), + }; struct memory_group *group = NULL; u64 start, size; bool new_node = false; @@ -1356,8 +1438,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { if (mhp_supports_memmap_on_memory(size)) { - mhp_altmap.free = PHYS_PFN(size); - mhp_altmap.base_pfn = PHYS_PFN(start); + mhp_altmap.free = memory_block_memmap_on_memory_pages(); params.altmap = &mhp_altmap; } /* fallback to not using altmap */ @@ -1369,8 +1450,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.alloc, - group); + ret = create_memory_block_devices(start, size, mhp_altmap.free, group); if (ret) { arch_remove_memory(start, size, NULL); goto error; @@ -2096,6 +2176,8 @@ static int __ref try_remove_memory(u64 start, u64 size) * right thing if we used vmem_altmap when hot-adding * the range. */ + mhp_altmap.base_pfn = PHYS_PFN(start); + mhp_altmap.free = nr_vmemmap_pages; mhp_altmap.alloc = nr_vmemmap_pages; altmap = &mhp_altmap; } -- cgit v1.2.3 From 1a8c64e110435e44e71bcd50a75663174b575f22 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:01 +0530 Subject: mm/memory_hotplug: embed vmem_altmap details in memory block With memmap on memory, some architecture needs more details w.r.t altmap such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of computing them again when we remove a memory block, embed vmem_altmap details in struct memory_block if we are using memmap on memory block feature. 
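Stepping back to the knob that drives all of this: the mode added earlier in the series is exposed through the memory_hotplug.memmap_on_memory parameter, which now reports "force" in addition to Y/N. A small userspace probe of the active mode, assuming the usual /sys/module layout for built-in module parameters:

#include <stdio.h>

int main(void)
{
	char mode[16];
	FILE *f = fopen("/sys/module/memory_hotplug/parameters/memmap_on_memory", "r");

	if (!f) {
		perror("memmap_on_memory parameter not available");
		return 1;
	}
	if (fgets(mode, sizeof(mode), f))
		printf("memmap_on_memory = %s", mode);	/* "N", "Y" or "force" */
	fclose(f);
	return 0;
}
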
[yangyingliang@huawei.com: fix error return code in add_memory_resource()] Link: https://lkml.kernel.org/r/20230809081552.1351184-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20230808091501.287660-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Yang Yingliang Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/base/memory.c | 27 +++++++++++++++--------- include/linux/memory.h | 8 ++------ mm/memory_hotplug.c | 56 +++++++++++++++++++++++++++++++------------------- 3 files changed, 54 insertions(+), 37 deletions(-) (limited to 'mm') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b456ac213610..8191709c9ad2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -105,7 +105,8 @@ EXPORT_SYMBOL(unregister_memory_notifier); static void memory_block_release(struct device *dev) { struct memory_block *mem = to_memory_block(dev); - + /* Verify that the altmap is freed */ + WARN_ON(mem->altmap); kfree(mem); } @@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; struct zone *zone; int ret; @@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem) * stage helps to keep accounting easier to follow - e.g vmemmaps * belong to the same zone as the memory they backed. */ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->free; + if (nr_vmemmap_pages) { ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) @@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; int ret; if (!mem->zone) @@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem) * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). 
*/ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->free; + if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); @@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid, #endif static int add_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { struct memory_block *mem; @@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, unsigned long state, mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; - mem->nr_vmemmap_pages = nr_vmemmap_pages; + mem->altmap = altmap; INIT_LIST_HEAD(&mem->group_next); #ifndef CONFIG_NUMA @@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return add_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0, NULL); + MEM_ONLINE, NULL, NULL); } static int add_hotplug_memory_block(unsigned long block_id, - unsigned long nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { - return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group); + return add_memory_block(block_id, MEM_OFFLINE, altmap, group); } static void remove_memory_block(struct memory_block *memory) @@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_hotplug_memory_block(block_id, vmemmap_pages, group); + ret = add_hotplug_memory_block(block_id, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 31343566c221..f53cfdaaaa41 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -77,11 +77,7 @@ struct memory_block { */ struct zone *zone; struct device dev; - /* - * Number of vmemmap pages. These pages - * lay at the beginning of the memory block. 
- */ - unsigned long nr_vmemmap_pages; + struct vmem_altmap *altmap; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) @@ -147,7 +143,7 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 76b813991bdc..1b03f4ec6fd2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1439,7 +1439,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { if (mhp_supports_memmap_on_memory(size)) { mhp_altmap.free = memory_block_memmap_on_memory_pages(); - params.altmap = &mhp_altmap; + params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL); + if (!params.altmap) { + ret = -ENOMEM; + goto error; + } + + memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap)); } /* fallback to not using altmap */ } @@ -1447,13 +1453,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) - goto error; + goto error_free; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.free, group); + ret = create_memory_block_devices(start, size, params.altmap, group); if (ret) { arch_remove_memory(start, size, NULL); - goto error; + goto error_free; } if (new_node) { @@ -1490,6 +1496,8 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; +error_free: + kfree(params.altmap); error: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); @@ -2056,12 +2064,18 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +static int test_has_altmap_cb(struct memory_block *mem, void *arg) { + struct memory_block **mem_ptr = (struct memory_block **)arg; /* - * If not set, continue with the next block. + * return the memblock if we have altmap + * and break callback. */ - return mem->nr_vmemmap_pages; + if (mem->altmap) { + *mem_ptr = mem; + return 1; + } + return 0; } static int check_cpu_on_node(int nid) @@ -2136,10 +2150,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(u64 start, u64 size) { - struct vmem_altmap mhp_altmap = {}; - struct vmem_altmap *altmap = NULL; - unsigned long nr_vmemmap_pages; + struct memory_block *mem; int rc = 0, nid = NUMA_NO_NODE; + struct vmem_altmap *altmap = NULL; BUG_ON(check_hotplug_memory_range(start, size)); @@ -2161,25 +2174,20 @@ static int __ref try_remove_memory(u64 start, u64 size) * the same granularity it was added - a single memory block. 
*/ if (mhp_memmap_on_memory()) { - nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, - get_nr_vmemmap_pages_cb); - if (nr_vmemmap_pages) { + rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb); + if (rc) { if (size != memory_block_size_bytes()) { pr_warn("Refuse to remove %#llx - %#llx," "wrong granularity\n", start, start + size); return -EINVAL; } - + altmap = mem->altmap; /* - * Let remove_pmd_table->free_hugepage_table do the - * right thing if we used vmem_altmap when hot-adding - * the range. + * Mark altmap NULL so that we can add a debug + * check on memblock free. */ - mhp_altmap.base_pfn = PHYS_PFN(start); - mhp_altmap.free = nr_vmemmap_pages; - mhp_altmap.alloc = nr_vmemmap_pages; - altmap = &mhp_altmap; + mem->altmap = NULL; } } @@ -2196,6 +2204,12 @@ static int __ref try_remove_memory(u64 start, u64 size) arch_remove_memory(start, size, altmap); + /* Verify that all vmemmap pages have actually been freed. */ + if (altmap) { + WARN(altmap->alloc, "Altmap not fully unmapped"); + kfree(altmap); + } + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_phys_free(start, size); memblock_remove(start, size); -- cgit v1.2.3 From f142b2c2530c1383a45e1ada1d641974b9723a35 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 18:07:53 +0800 Subject: mm/page_alloc: remove track of active PCP lists range in bulk free Patch series "Two minor cleanups for pcp list in page_alloc". There are two minor cleanups for pcp list in page_alloc. More details can be found in respective patches. This patch (of 2): After commit fd56eef258a17 ("mm/page_alloc: simplify how many pages are selected per pcp list during bulk free"), we will drain all pages in selected pcp list. And we ensured passed count is < pcp->count. Then, the search will finish before wrap-around and track of active PCP lists range intended for wrap-around case is no longer needed. Link: https://lkml.kernel.org/r/20230809100754.3094517-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230809100754.3094517-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94f9e159a18d..bc782bffaf02 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1207,8 +1207,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int pindex) { unsigned long flags; - int min_pindex = 0; - int max_pindex = NR_PCP_LISTS - 1; unsigned int order; bool isolated_pageblocks; struct page *page; @@ -1231,17 +1229,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Remove pages from lists in a round-robin fashion. */ do { - if (++pindex > max_pindex) - pindex = min_pindex; + if (++pindex > NR_PCP_LISTS - 1) + pindex = 0; list = &pcp->lists[pindex]; - if (!list_empty(list)) - break; - - if (pindex == max_pindex) - max_pindex--; - if (pindex == min_pindex) - min_pindex++; - } while (1); + } while (list_empty(list)); order = pindex_to_order(pindex); nr_pages = 1 << order; -- cgit v1.2.3 From 1305870529d9e16170bb744148aab6dffb19bb23 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 18:07:54 +0800 Subject: mm/page_alloc: remove unnecessary parameter batch of nr_pcp_free We get batch from pcp and just pass it to nr_pcp_free immediately. Get batch from pcp inside nr_pcp_free to remove unnecessary parameter batch. 
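The first cleanup in this pair leans on the invariant spelled out above, that the requested count is below pcp->count, so at least one list is non-empty and a bare wrap-around scan must terminate. A toy userspace model of the new selection loop, with the list size and per-list counts invented purely for illustration:

#include <stdio.h>

#define NR_PCP_LISTS 12		/* toy value, not the kernel's NR_PCP_LISTS */

int main(void)
{
	/* invented contents; the kernel guarantees at least one non-empty list */
	int pages_in_list[NR_PCP_LISTS] = { 0, 0, 0, 5, 0, 2, 0, 0, 0, 0, 0, 0 };
	int pindex = 7;		/* arbitrary starting hint, as passed in by the caller */

	/* mirrors the simplified round-robin walk in free_pcppages_bulk() */
	do {
		if (++pindex > NR_PCP_LISTS - 1)
			pindex = 0;
	} while (pages_in_list[pindex] == 0);

	printf("would drain list %d next (%d pages)\n", pindex, pages_in_list[pindex]);
	return 0;
}
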
Link: https://lkml.kernel.org/r/20230809100754.3094517-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc782bffaf02..cfadde2fe3a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2340,10 +2340,10 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, - bool free_high) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) { int min_nr_free, max_nr_free; + int batch = READ_ONCE(pcp->batch); /* Free everything if batch freeing high-order pages. */ if (unlikely(free_high)) @@ -2410,9 +2410,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { - int batch = READ_ONCE(pcp->batch); - - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); } } -- cgit v1.2.3 From 8fbb92bd10be26d0feec6bc35332159145c27cc0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 17:49:10 +0800 Subject: mm/compaction: remove unused parameter pgdata of fragmentation_score_wmark Parameter pgdat is not used in fragmentation_score_wmark. Just remove it. Link: https://lkml.kernel.org/r/20230809094910.3092446-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Reviewed-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index fe7b4e7c5d24..216081ab325a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2140,7 +2140,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) return score; } -static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +static unsigned int fragmentation_score_wmark(bool low) { unsigned int wmark_low; @@ -2160,7 +2160,7 @@ static bool should_proactive_compact_node(pg_data_t *pgdat) if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) return false; - wmark_high = fragmentation_score_wmark(pgdat, false); + wmark_high = fragmentation_score_wmark(false); return fragmentation_score_node(pgdat) > wmark_high; } @@ -2199,7 +2199,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; score = fragmentation_score_zone(cc->zone); - wmark_low = fragmentation_score_wmark(pgdat, true); + wmark_low = fragmentation_score_wmark(true); if (score > wmark_low) ret = COMPACT_CONTINUE; -- cgit v1.2.3 From b7108d66318abf3e060c7839eabcba52e9461568 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Wed, 9 Aug 2023 13:35:44 +0530 Subject: Multi-gen LRU: skip CMA pages when they are not eligible This patch is based on the commit 5da226dbfce3("mm: skip CMA pages when they are not available") which skips cma pages reclaim when they are not eligible for the current allocation context. 
In mglru, such pages are added to the tail of the immediate generation to maintain better LRU order, which is unlike the case of conventional LRU where such pages are directly added to the head of the LRU list(akin to adding to head of the youngest generation in mglru). No observable issue without this patch on MGLRU, but logically it make sense to skip the CMA page reclaim when those pages can't be satisfied for the current allocation context. Link: https://lkml.kernel.org/r/1691568344-13475-1-git-send-email-quic_charante@quicinc.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Charan Teja Kalla Reviewed-by: Kalesh Singh Cc: David Hildenbrand Cc: Suren Baghdasaryan Cc: Yu Zhao Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b4329f93a682..6cbe921ef662 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4943,7 +4943,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; -- cgit v1.2.3 From 368d983b985572e33422432d849f5956268bce21 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 9 Aug 2023 15:33:23 +0800 Subject: mm: page_alloc: remove unused parameter from reserve_highatomic_pageblock() Just remove the redundant parameter alloc_order from reserve_highatomic_pageblock(). No functional modification involved. Link: https://lkml.kernel.org/r/20230809073323.1065286-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Kefeng Wang Cc: Mel Gorman Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfadde2fe3a2..a5d14ca13946 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1899,8 +1899,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * Reserve a pageblock for exclusive use of high-order atomic allocations if * there are no empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, - unsigned int alloc_order) +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -3210,7 +3209,7 @@ try_this_zone: * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone, order); + reserve_highatomic_pageblock(page, zone); return page; } else { -- cgit v1.2.3 From a04d12c2481fbf2752b5686d8a8049dd59e61e37 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 11 Aug 2023 19:59:44 +0800 Subject: mm/page_alloc: remove unnecessary inner __get_pfnblock_flags_mask Patch series "Two minor cleanups for get pageblock migratetype". This series contains two minor cleanups for get pageblock migratetype. More details can be found in respective patches. This patch (of 2): get_pfnblock_flags_mask() just calls inline inner __get_pfnblock_flags_mask without any extra work. Just opencode __get_pfnblock_flags_mask in get_pfnblock_flags_mask and replace call to __get_pfnblock_flags_mask with call to get_pfnblock_flags_mask to remove unnecessary __get_pfnblock_flags_mask. 
Link: https://lkml.kernel.org/r/20230811115945.3423894-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230811115945.3423894-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Mel Gorman Reviewed-by: Matthew Wilcox (Oracle) Cc: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/page_alloc.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5d14ca13946..5e67fe937e19 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -371,10 +371,16 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } -static __always_inline -unsigned long __get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, - unsigned long mask) +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { unsigned long *bitmap; unsigned long bitidx, word_bitidx; @@ -393,24 +399,10 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, return (word >> bitidx) & mask; } -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, unsigned long mask) -{ - return __get_pfnblock_flags_mask(page, pfn, mask); -} - static __always_inline int get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); + return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } /** -- cgit v1.2.3 From b5ffd2973365298c4d829802653133038837a6f9 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 11 Aug 2023 19:59:45 +0800 Subject: mm/page_alloc: use get_pfnblock_migratetype to avoid extra page_to_pfn We have get_pageblock_migratetype and get_pfnblock_migratetype to get migratetype of page. get_pfnblock_migratetype accepts both page and pfn from caller while get_pageblock_migratetype only accept page and get pfn with page_to_pfn from page. In case we already record pfn of page, we can simply call get_pfnblock_migratetype to avoid a page_to_pfn. Link: https://lkml.kernel.org/r/20230811115945.3423894-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Mel Gorman Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e67fe937e19..986b56db96b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -813,7 +813,7 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. 
*/ - int buddy_mt = get_pageblock_migratetype(buddy); + int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); if (migratetype != buddy_mt && (!migratetype_is_mergeable(migratetype) || @@ -889,7 +889,7 @@ int split_free_page(struct page *free_page, goto out; } - mt = get_pageblock_migratetype(free_page); + mt = get_pfnblock_migratetype(free_page, free_page_pfn); if (likely(!is_migrate_isolate(mt))) __mod_zone_freepage_state(zone, -(1UL << order), mt); -- cgit v1.2.3 From e1dea6d3c68113ac5d15a762e0c93e811e569739 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 14 Aug 2023 22:11:42 +0000 Subject: mm/z3fold: remove obsolete comment for struct z3fold_pool Since commit e774a7bc7f0a ("mm: zswap: remove page reclaim logic from z3fold"), zpool and zpool_ops have been removed, so also remove the corresponding comments. Link: https://lkml.kernel.org/r/20230814221142.486548-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Reviewed-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/z3fold.c b/mm/z3fold.c index 7952adf9bede..7c76b396b74c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -133,8 +133,6 @@ struct z3fold_header { * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation - * @zpool: zpool driver - * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release -- cgit v1.2.3 From f5ecca06b3a5d0371ee27ee08aa06c686407a8af Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:47 -0700 Subject: mm: convert ptlock_alloc() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fee233dfccc..ccea0665247c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2826,7 +2826,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); -extern bool ptlock_alloc(struct page *page); +bool ptlock_alloc(struct ptdesc *ptdesc); extern void ptlock_free(struct page *page); static inline spinlock_t *ptlock_ptr(struct page *page) @@ -2838,7 +2838,7 @@ static inline void ptlock_cache_init(void) { } -static inline bool ptlock_alloc(struct page *page) +static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } @@ -2868,7 +2868,7 @@ static inline bool ptlock_init(struct page *page) * slab code uses page->slab_cache, which share storage with page->ptl. 
*/ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page)) + if (!ptlock_alloc(page_ptdesc(page))) return false; spin_lock_init(ptlock_ptr(page)); return true; diff --git a/mm/memory.c b/mm/memory.c index 039dcbbcc7d2..b9ba7e99534d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6114,14 +6114,14 @@ void __init ptlock_cache_init(void) SLAB_PANIC, NULL); } -bool ptlock_alloc(struct page *page) +bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; - page->ptl = ptl; + ptdesc->ptl = ptl; return true; } -- cgit v1.2.3 From 6ed1b8a09deb0b99fd3b54e11535c80284689555 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:52 -0700 Subject: mm: convert ptlock_free() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-11-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- mm/memory.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8884c700dfc6..d0fb31bcd482 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2827,7 +2827,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); -extern void ptlock_free(struct page *page); +void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { @@ -2843,7 +2843,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc) return true; } -static inline void ptlock_free(struct page *page) +static inline void ptlock_free(struct ptdesc *ptdesc) { } @@ -2884,7 +2884,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void ptlock_free(struct page *page) {} +static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_pte_page_ctor(struct page *page) @@ -2898,7 +2898,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) static inline void pgtable_pte_page_dtor(struct page *page) { - ptlock_free(page); + ptlock_free(page_ptdesc(page)); __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } @@ -2972,7 +2972,7 @@ static inline void pmd_ptlock_free(struct ptdesc *ptdesc) #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif - ptlock_free(ptdesc_page(ptdesc)); + ptlock_free(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) diff --git a/mm/memory.c b/mm/memory.c index b9ba7e99534d..4a7c8be9fe71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6125,8 +6125,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) return true; } -void ptlock_free(struct page *page) +void 
ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, page->ptl); + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif -- cgit v1.2.3 From 202e14222fadb246dfdf182e67de1518e86a1e20 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:58 +1000 Subject: memfd: do not -EACCES old memfd_create() users with vm.memfd_noexec=2 Given the difficulty of auditing all of userspace to figure out whether every memfd_create() user has switched to passing MFD_EXEC and MFD_NOEXEC_SEAL flags, it seems far less distruptive to make it possible for older programs that don't make use of executable memfds to run under vm.memfd_noexec=2. Otherwise, a small dependency change can result in spurious errors. For programs that don't use executable memfds, passing MFD_NOEXEC_SEAL is functionally a no-op and thus having the same In addition, every failure under vm.memfd_noexec=2 needs to print to the kernel log so that userspace can figure out where the error came from. The concerns about pr_warn_ratelimited() spam that caused the switch to pr_warn_once()[1,2] do not apply to the vm.memfd_noexec=2 case. This is a user-visible API change, but as it allows programs to do something that would be blocked before, and the sysctl itself was broken and recently released, it seems unlikely this will cause any issues. [1]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [2]: https://lore.kernel.org/202212161233.85C9783FB@keescook/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-2-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 16 ++++------------ mm/memfd.c | 30 +++++++++++------------------- tools/testing/selftests/memfd/memfd_test.c | 22 +++++++++++++++++----- 3 files changed, 32 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c758809d5bcf..53974d79d98e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -17,18 +17,10 @@ struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -/* - * sysctl for vm.memfd_noexec - * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_EXEC was set. - * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_NOEXEC_SEAL was set. - * 2: memfd_create() without MFD_NOEXEC_SEAL will be - * rejected. 
- */ -#define MEMFD_NOEXEC_SCOPE_EXEC 0 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +/* modes for vm.memfd_noexec sysctl */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { diff --git a/mm/memfd.c b/mm/memfd.c index 0bdbd2335af7..d65485c762de 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -271,30 +271,22 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL - char comm[TASK_COMM_LEN]; - int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; - struct pid_namespace *ns; - - ns = task_active_pid_ns(current); - if (ns) - sysctl = ns->memfd_noexec_scope; + int sysctl = task_active_pid_ns(current)->memfd_noexec_scope; if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { - if (sysctl == MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) + if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } - if (*flags & MFD_EXEC && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { - pr_warn_once( - "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); - + if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { + pr_err_ratelimited( + "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", + current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif - return 0; } @@ -302,7 +294,6 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - char comm[TASK_COMM_LEN]; unsigned int *file_seals; struct file *file; int fd, error; @@ -325,12 +316,13 @@ SYSCALL_DEFINE2(memfd_create, if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { pr_warn_once( - "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); + "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n", + current->comm, task_pid_nr(current)); } - if (check_sysctl_memfd_noexec(&flags) < 0) - return -EACCES; + error = check_sysctl_memfd_noexec(&flags); + if (error < 0) + return error; /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 8eb49204f9ea..8b7390ad81d1 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1145,11 +1145,23 @@ static void test_sysctl_child(void) printf("%s sysctl 2\n", memfd_str); sysctl_assert_write("2"); - mfd_fail_new("kern_memfd_sysctl_2", - MFD_CLOEXEC | MFD_ALLOW_SEALING); - mfd_fail_new("kern_memfd_sysctl_2_MFD_EXEC", - MFD_CLOEXEC | MFD_EXEC); - fd = mfd_assert_new("", 0, MFD_NOEXEC_SEAL); + mfd_fail_new("kern_memfd_sysctl_2_exec", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); + + fd = mfd_assert_new("kern_memfd_sysctl_2_dfl", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); + + fd = mfd_assert_new("kern_memfd_sysctl_2_noexec_seal", + mfd_def_size, + MFD_NOEXEC_SEAL | MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); close(fd); sysctl_fail_write("0"); -- cgit 
v1.2.3 From 434ed3350f57c03a9654fe0619755cc137a58935 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:59 +1000 Subject: memfd: improve userspace warnings for missing exec-related flags In order to incentivise userspace to switch to passing MFD_EXEC and MFD_NOEXEC_SEAL, we need to provide a warning on each attempt to call memfd_create() without the new flags. pr_warn_once() is not useful because on most systems the one warning is burned up during the boot process (on my system, systemd does this within the first second of boot) and thus userspace will in practice never see the warnings to push them to switch to the new flags. The original patchset[1] used pr_warn_ratelimited(), however there were concerns about the degree of spam in the kernel log[2,3]. The resulting inability to detect every case was flagged as an issue at the time[4]. While we could come up with an alternative rate-limiting scheme such as only outputting the message if vm.memfd_noexec has been modified, or only outputting the message once for a given task, these alternatives have downsides that don't make sense given how low-stakes a single kernel warning message is. Switching to pr_info_ratelimited() instead should be fine -- it's possible some monitoring tool will be unhappy with a stream of warning-level messages but there's already plenty of info-level message spam in dmesg. [1]: https://lore.kernel.org/20221215001205.51969-4-jeffxu@google.com/ [2]: https://lore.kernel.org/202212161233.85C9783FB@keescook/ [3]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [4]: https://lore.kernel.org/f185bb42-b29c-977e-312e-3349eea15383@linuxfoundation.org/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-3-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Christian Brauner Cc: Daniel Verkamp Cc: Dominique Martinet Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memfd.c b/mm/memfd.c index d65485c762de..aa46521057ab 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -315,7 +315,7 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { - pr_warn_once( + pr_info_ratelimited( "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n", current->comm, task_pid_nr(current)); } -- cgit v1.2.3 From 9876cfe8ec1cb3c88de31f4d58d57b0e7e22bcc4 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:41:00 +1000 Subject: memfd: replace ratcheting feature from vm.memfd_noexec with hierarchy This sysctl has the very unusual behaviour of not allowing any user (even CAP_SYS_ADMIN) to reduce the restriction setting, meaning that if you were to set this sysctl to a more restrictive option in the host pidns you would need to reboot your machine in order to reset it. The justification given in [1] is that this is a security feature and thus it should not be possible to disable. Aside from the fact that we have plenty of security-related sysctls that can be disabled after being enabled (fs.protected_symlinks for instance), the protection provided by the sysctl is to stop users from being able to create a binary and then execute it. 
A user with CAP_SYS_ADMIN can trivially do this without memfd_create(2): % cat mount-memfd.c #include #include #include #include #include #include #define SHELLCODE "#!/bin/echo this file was executed from this totally private tmpfs:" int main(void) { int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); assert(fsfd >= 0); assert(!fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 2)); int dfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); assert(dfd >= 0); int execfd = openat(dfd, "exe", O_CREAT | O_RDWR | O_CLOEXEC, 0782); assert(execfd >= 0); assert(write(execfd, SHELLCODE, strlen(SHELLCODE)) == strlen(SHELLCODE)); assert(!close(execfd)); char *execpath = NULL; char *argv[] = { "bad-exe", NULL }, *envp[] = { NULL }; execfd = openat(dfd, "exe", O_PATH | O_CLOEXEC); assert(execfd >= 0); assert(asprintf(&execpath, "/proc/self/fd/%d", execfd) > 0); assert(!execve(execpath, argv, envp)); } % ./mount-memfd this file was executed from this totally private tmpfs: /proc/self/fd/5 % Given that it is possible for CAP_SYS_ADMIN users to create executable binaries without memfd_create(2) and without touching the host filesystem (not to mention the many other things a CAP_SYS_ADMIN process would be able to do that would be equivalent or worse), it seems strange to cause a fair amount of headache to admins when there doesn't appear to be an actual security benefit to blocking this. There appear to be concerns about confused-deputy-esque attacks[2] but a confused deputy that can write to arbitrary sysctls is a bigger security issue than executable memfds. /* New API */ The primary requirement from the original author appears to be more based on the need to be able to restrict an entire system in a hierarchical manner[3], such that child namespaces cannot re-enable executable memfds. So, implement that behaviour explicitly -- the vm.memfd_noexec scope is evaluated up the pidns tree to &init_pid_ns and you have the most restrictive value applied to you. The new lower limit you can set vm.memfd_noexec is whatever limit applies to your parent. Note that a pidns will inherit a copy of the parent pidns's effective vm.memfd_noexec setting at unshare() time. This matches the existing behaviour, and it also ensures that a pidns will never have its vm.memfd_noexec setting *lowered* behind its back (but it will be raised if the parent raises theirs). /* Backwards Compatibility */ As the previous version of the sysctl didn't allow you to lower the setting at all, there are no backwards compatibility issues with this aspect of the change. However it should be noted that now that the setting is completely hierarchical. Previously, a cloned pidns would just copy the current pidns setting, meaning that if the parent's vm.memfd_noexec was changed it wouldn't propoagate to existing pid namespaces. Now, the restriction applies recursively. This is a uAPI change, however: * The sysctl is very new, having been merged in 6.3. * Several aspects of the sysctl were broken up until this patchset and the other patchset by Jeff Xu last month. And thus it seems incredibly unlikely that any real users would run into this issue. In the worst case, if this causes userspace isues we could make it so that modifying the setting follows the hierarchical rules but the restriction checking uses the cached copy. 
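The user-visible effect of these three memfd patches is easy to check from userspace. The short program below uses glibc's memfd_create() wrapper; the MFD_EXEC/MFD_NOEXEC_SEAL fallback definitions are copied from the uapi header for older toolchains, and the expectations in the comments describe the behaviour with this series applied:

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MFD_NOEXEC_SEAL
#define MFD_NOEXEC_SEAL	0x0008U
#endif
#ifndef MFD_EXEC
#define MFD_EXEC	0x0010U
#endif

static void try_flags(const char *name, unsigned int flags)
{
	int fd = memfd_create(name, flags);

	printf("%-22s -> %s\n", name, fd < 0 ? strerror(errno) : "ok");
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	/* legacy caller: now succeeds even under vm.memfd_noexec=2, with
	   MFD_NOEXEC_SEAL implied (previously it failed with EACCES) */
	try_flags("legacy_no_exec_flag", MFD_CLOEXEC);

	/* explicitly executable: still rejected under vm.memfd_noexec=2 */
	try_flags("explicit_exec", MFD_CLOEXEC | MFD_EXEC);

	/* explicitly sealed non-executable: accepted under every setting */
	try_flags("explicit_noexec_seal", MFD_CLOEXEC | MFD_NOEXEC_SEAL);
	return 0;
}
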
[1]: https://lore.kernel.org/CABi2SkWnAgHK1i6iqSqPMYuNEhtHBkO8jUuCvmG3RmUB5TKHJw@mail.gmail.com/ [2]: https://lore.kernel.org/CALmYWFs_dNCzw_pW1yRAo4bGCPEtykroEQaowNULp7svwMLjOg@mail.gmail.com/ [3]: https://lore.kernel.org/CALmYWFuahdUF7cT4cm7_TGLqPanuHXJ-hVSfZt7vpTnc18DPrw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-4-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 23 ++++++++++++++++++++++- kernel/pid.c | 3 +++ kernel/pid_namespace.c | 6 +++--- kernel/pid_sysctl.h | 28 ++++++++++++---------------- mm/memfd.c | 3 ++- 5 files changed, 42 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 53974d79d98e..f9f9931e02d6 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -39,7 +39,6 @@ struct pid_namespace { int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) - /* sysctl for vm.memfd_noexec */ int memfd_noexec_scope; #endif } __randomize_layout; @@ -56,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + int scope = MEMFD_NOEXEC_SCOPE_EXEC; + + for (; ns; ns = ns->parent) + scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); + + return scope; +} +#else +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} +#endif + extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); @@ -70,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} + static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { diff --git a/kernel/pid.c b/kernel/pid.c index 6a1d23a11026..fee14a4486a3 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = { #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0bf44afe04dd..619972c78774 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - - initialize_memfd_noexec_scope(ns); - +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); +#endif return ns; out_free_idr: diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index b26e027fc9cd..2ee41a3a1dfd 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -5,33 +5,30 @@ #include #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -static inline void 
initialize_memfd_noexec_scope(struct pid_namespace *ns) -{ - ns->memfd_noexec_scope = - task_active_pid_ns(current)->memfd_noexec_scope; -} - static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { struct pid_namespace *ns = task_active_pid_ns(current); struct ctl_table table_copy; + int err, scope, parent_scope; if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; table_copy = *table; - if (ns != &init_pid_ns) - table_copy.data = &ns->memfd_noexec_scope; - /* - * set minimum to current value, the effect is only bigger - * value is accepted. - */ - if (*(int *)table_copy.data > *(int *)table_copy.extra1) - table_copy.extra1 = table_copy.data; + /* You cannot set a lower enforcement value than your parent. */ + parent_scope = pidns_memfd_noexec_scope(ns->parent); + /* Equivalent to pidns_memfd_noexec_scope(ns). */ + scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope); + + table_copy.data = &scope; + table_copy.extra1 = &parent_scope; - return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + if (!err && write) + WRITE_ONCE(ns->memfd_noexec_scope, scope); + return err; } static struct ctl_table pid_ns_ctl_table_vm[] = { @@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void) register_sysctl("vm", pid_ns_ctl_table_vm); } #else -static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} static inline void register_pid_ns_sysctl_table_vm(void) {} #endif diff --git a/mm/memfd.c b/mm/memfd.c index aa46521057ab..1cad1904fc26 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -271,7 +271,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL - int sysctl = task_active_pid_ns(current)->memfd_noexec_scope; + struct pid_namespace *ns = task_active_pid_ns(current); + int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) -- cgit v1.2.3 From 8dbbc49345a7452ee59783ed5f5637b5d59532ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Aug 2023 13:00:18 -0700 Subject: mm,thp: no space after colon in Mem-Info fields Patch series "mm,thp: fix sloppy text output". Three independent trivial patches, fixing sloppy text output which has annoyed me; but might risk surprising a parser, so any can be dropped. This patch (of 3): The SysRq-m or OOM Mem-Info dmesg showed (long lines containing) ... shmem:NkB shmem_thp: NkB shmem_pmdmapped: NkB anon_thp: NkB ... Delete the space after the colon after shmem_thp, shmem_pmdmapped, anon_thp: as the shmem example shows, no other fields have a space after the colon in this output. 
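To make the "might risk surprising a parser" concern concrete, consider a consumer (an editorial sketch, not taken from any real tool) that splits the Mem-Info line on spaces and expects every token to be a self-contained name:valuekB pair. With the old spacing, the THP fields break into two tokens each and such a parser mishandles them; with the new spacing they parse like every other field.

  #include <stdio.h>
  #include <string.h>

  static void parse(char *line)
  {
          for (char *tok = strtok(line, " "); tok; tok = strtok(NULL, " ")) {
                  char name[32];
                  unsigned long kb;

                  if (sscanf(tok, "%31[^:]:%lukB", name, &kb) == 2)
                          printf("%s = %lu kB\n", name, kb);
                  else
                          printf("unparsed token: '%s'\n", tok);
          }
  }

  int main(void)
  {
          char old_fmt[] = "shmem:64kB shmem_thp: 2048kB";  /* pre-patch spacing */
          char new_fmt[] = "shmem:64kB shmem_thp:2048kB";   /* post-patch spacing */

          parse(old_fmt);
          parse(new_fmt);
          return 0;
  }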
Link: https://lkml.kernel.org/r/dc264fd6-40bb-6510-db36-9340a5f01d94@google.com Link: https://lkml.kernel.org/r/c1edd7da-5493-c542-6feb-92452b4dab3b@google.com Signed-off-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- mm/show_mem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/show_mem.c b/mm/show_mem.c index 09c7d036d49e..4b888b18bdde 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -251,9 +251,9 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " writeback:%lukB" " shmem:%lukB" #ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" + " shmem_thp:%lukB" + " shmem_pmdmapped:%lukB" + " anon_thp:%lukB" #endif " writeback_tmp:%lukB" " kernel_stack:%lukB" -- cgit v1.2.3 From 7e2fca52ef918e5c983391f984ed5c98b0dea6a1 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 12 Aug 2023 14:26:12 +0800 Subject: mm/secretmem: use a folio in secretmem_fault() Saves four implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230812062612.3184990-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/secretmem.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/secretmem.c b/mm/secretmem.c index 86442a15d12f..3afb5ad701e1 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -55,6 +55,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; + struct folio *folio; vm_fault_t ret; int err; @@ -66,23 +67,24 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) retry: page = find_lock_page(mapping, offset); if (!page) { - page = alloc_page(gfp | __GFP_ZERO); - if (!page) { + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (!folio) { ret = VM_FAULT_OOM; goto out; } + page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { - put_page(page); + folio_put(folio); ret = vmf_error(err); goto out; } - __SetPageUptodate(page); - err = add_to_page_cache_lru(page, mapping, offset, gfp); + __folio_mark_uptodate(folio); + err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid -- cgit v1.2.3 From 0790e1e2b1b71ba357e89e779451efe79dff28e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 12 Aug 2023 01:20:33 +0100 Subject: mm: allow fault_dirty_shared_page() to be called under the VMA lock By making maybe_unlock_mmap_for_io() handle the VMA lock correctly, we make fault_dirty_shared_page() safe to be called without the mmap lock held. 
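For context (an editorial sketch rather than the verbatim kernel code), release_fault_lock() from the per-VMA lock series drops whichever lock the fault path actually took, which is why substituting it for the unconditional mmap_read_unlock() makes the helper safe under the VMA lock; roughly:

  /* Drop the per-VMA read lock if this fault was handled under it,
   * otherwise fall back to dropping the mmap read lock. */
  static inline void release_fault_lock(struct vm_fault *vmf)
  {
          if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                  vma_end_read(vmf->vma);
          else
                  mmap_read_unlock(vmf->vma->vm_mm);
  }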
Link: https://lkml.kernel.org/r/20230812002033.1002367-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: David Hildenbrand Tested-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index a037b1b37f6d..c6ed10f0a5ad 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -706,7 +706,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); } return fpin; } -- cgit v1.2.3 From b348b5fe2b5f14ac8bb64fe271d7a027db8cc674 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 11 Aug 2023 12:36:55 -0700 Subject: mm/ksm: add pages scanned metric ksm currently maintains several statistics, which let you determine how successful KSM is at sharing pages. However it does not contain a metric to determine how much work it does. This commit adds the pages scanned metric. This allows the administrator to determine how many pages have been scanned over a period of time. Link: https://lkml.kernel.org/r/20230811193655.2518943-1-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 2 ++ mm/ksm.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 5c5be7bd84b8..776f244bdae4 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -159,6 +159,8 @@ The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``: general_profit how effective is KSM. The calculation is explained below. 
+pages_scanned + how many pages are being scanned for ksm pages_shared how many shared pages are being used pages_sharing diff --git a/mm/ksm.c b/mm/ksm.c index 97a9627116fa..2653099539f2 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -242,6 +242,9 @@ static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; +/* The number of pages scanned */ +static unsigned long ksm_pages_scanned; + /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; @@ -2476,8 +2479,9 @@ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; + unsigned int npages = scan_npages; - while (scan_npages-- && likely(!freezing(current))) { + while (npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -2485,6 +2489,8 @@ static void ksm_do_scan(unsigned int scan_npages) cmp_and_merge_page(page, rmap_item); put_page(page); } + + ksm_pages_scanned += scan_npages - npages; } static int ksmd_should_run(void) @@ -3323,6 +3329,13 @@ static ssize_t max_page_sharing_store(struct kobject *kobj, } KSM_ATTR(max_page_sharing); +static ssize_t pages_scanned_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); +} +KSM_ATTR_RO(pages_scanned); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3431,6 +3444,7 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, + &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, -- cgit v1.2.3 From 835bc157da689f834d10296d0942aed88b310b4f Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Tue, 15 Aug 2023 22:41:27 +0800 Subject: mm/kmemleak: use object_cache instead of kmemleak_initialized to check in set_track_prepare() Patch series "mm/kmemleak: use object_cache instead of kmemleak_initialized", v3. Use object_cache instead of kmemleak_initialized to check in set_track_prepare(), so that memory leaks after kmemleak_init() can be recorded and Rename kmemleak_initialized to kmemleak_late_initialized unreferenced object 0xc674ca80 (size 64): comm "swapper/0", pid 1, jiffies 4294938337 (age 204.880s) hex dump (first 32 bytes): 80 55 75 c6 80 54 75 c6 00 55 75 c6 80 52 75 c6 .Uu..Tu..Uu..Ru. 00 53 75 c6 00 00 00 00 00 00 00 00 00 00 00 00 .Su.......... This patch (of 2): kmemleak_initialized is set in kmemleak_late_init(), which also means that there is no call trace which object's memory leak is before kmemleak_late_init(), so use object_cache instead of kmemleak_initialized to check in set_track_prepare() to avoid no call trace records when there is a memory leak in the code between kmemleak_init() and kmemleak_late_init(). unreferenced object 0xc674ca80 (size 64): comm "swapper/0", pid 1, jiffies 4294938337 (age 204.880s) hex dump (first 32 bytes): 80 55 75 c6 80 54 75 c6 00 55 75 c6 80 52 75 c6 .Uu..Tu..Uu..Ru. 00 53 75 c6 00 00 00 00 00 00 00 00 00 00 00 00 .Su.......... 
Link: https://lkml.kernel.org/r/20230815144128.3623103-1-xiaolei.wang@windriver.com Link: https://lkml.kernel.org/r/20230815144128.3623103-2-xiaolei.wang@windriver.com Fixes: 56a61617dd22 ("mm: use stack_depot for recording kmemleak's backtrace") Signed-off-by: Xiaolei Wang Reviewed-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a2d34226e3c8..16fc7b0984b9 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -610,7 +610,12 @@ static noinline depot_stack_handle_t set_track_prepare(void) unsigned long entries[MAX_TRACE]; unsigned int nr_entries; - if (!kmemleak_initialized) + /* + * Use object_cache to determine whether kmemleak_init() has + * been invoked. stack_depot_early_init() is called before + * kmemleak_init() in mm_core_init(). + */ + if (!object_cache) return 0; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); -- cgit v1.2.3 From d160ef71b42c53573fc27188c20270a2ccdcc196 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Tue, 15 Aug 2023 22:41:28 +0800 Subject: Rename kmemleak_initialized to kmemleak_late_initialized The old name is confusing because it implies the completion of earlier kmemleak_init(), the new name update to kmemleak_late_initial represents the completion of kmemleak_late_init(). No functional changes. Link: https://lkml.kernel.org/r/20230815144128.3623103-3-xiaolei.wang@windriver.com Signed-off-by: Xiaolei Wang Acked-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 16fc7b0984b9..2918150e31bd 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -218,7 +218,7 @@ static int kmemleak_enabled = 1; /* same as above but only for the kmemleak_free() callback */ static int kmemleak_free_enabled = 1; /* set in the late_initcall if there were no errors */ -static int kmemleak_initialized; +static int kmemleak_late_initialized; /* set if a kmemleak warning was issued */ static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ @@ -2057,7 +2057,7 @@ static void kmemleak_disable(void) kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (kmemleak_initialized) + if (kmemleak_late_initialized) schedule_work(&cleanup_work); else kmemleak_free_enabled = 0; @@ -2122,7 +2122,7 @@ void __init kmemleak_init(void) */ static int __init kmemleak_late_init(void) { - kmemleak_initialized = 1; + kmemleak_late_initialized = 1; debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); @@ -2130,7 +2130,7 @@ static int __init kmemleak_late_init(void) /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately - * after setting kmemleak_initialized and we may end up with + * after setting kmemleak_late_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. 
*/ schedule_work(&cleanup_work); -- cgit v1.2.3 From 7acddcc1ae30670449d60dc9da5b00d544a5b58b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:05 +0200 Subject: mm/gup: don't implicitly set FOLL_HONOR_NUMA_FAULT Commit 0b9d705297b2 ("mm: numa: Support NUMA hinting page faults from gup/gup_fast") from 2012 documented as the primary reason why we would want to handle NUMA hinting faults from GUP: KVM secondary MMU page faults will trigger the NUMA hinting page faults through gup_fast -> get_user_pages -> follow_page -> handle_mm_fault. That is still the case today, and relevant KVM code has been converted to manually set FOLL_HONOR_NUMA_FAULT. So let's stop setting FOLL_HONOR_NUMA_FAULT for all GUP users and cross fingers that not that many other ones that really require such handling for autonuma remain. Possible interaction with MMU notifiers: Assume a driver obtains a page using get_user_pages() to map it into a secondary MMU, and uses the MMU notifier framework to get notified on changes. Assume get_user_pages() succeeded on a PROT_NONE-mapped page (because FOLL_HONOR_NUMA_FAULT is not set) in an accessible VMA and the page is mapped into a secondary MMU. Once user space would turn that mapping inaccessible using mprotect(PROT_NONE), the actual PTE in the page table might not change. If the MMU notifier would be smart and optimize for that case "why notify if the PTE didn't change", that could be problematic. At least change_pmd_range() with MMU_NOTIFY_PROTECTION_VMA for now does an unconditional mmu_notifier_invalidate_range_start() -> mmu_notifier_invalidate_range_end() and should be fine. Note that even if a PTE in an accessible VMA is pte_protnone(), the underlying page might be accessed by a secondary MMU that does not set FOLL_HONOR_NUMA_FAULT, and test_young() MMU notifiers would return "true". Link: https://lkml.kernel.org/r/20230803143208.383663-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Mel Gorman Cc: Paolo Bonzini Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/gup.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index 3bbfae411880..ee4fc15ce88e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2244,13 +2244,6 @@ static bool is_valid_gup_args(struct page **pages, int *locked, gup_flags |= FOLL_UNLOCKABLE; } - /* - * For now, always trigger NUMA hinting faults. Some GUP users like - * KVM require the hint to be as the calling context of GUP is - * functionally similar to a memory reference from task context. - */ - gup_flags |= FOLL_HONOR_NUMA_FAULT; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) -- cgit v1.2.3 From dd6fa0b61814492476463149c91110e529364e82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:50 +0100 Subject: mm: call free_huge_page() directly Indirect calls are expensive, thanks to Spectre. Call free_huge_page() directly if the folio belongs to hugetlb. 
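The saving here is the Spectre-hardened indirect branch. A userspace toy model (not kernel code) of the before/after shape of destroy_large_folio(): the table-driven version pays an indirect call on every free, while the rewritten version tests a predicate and makes a direct, easily predicted call.

  #include <stdbool.h>
  #include <stdio.h>

  struct folio_model { bool is_hugetlb; int dtor_id; };

  static void free_huge(struct folio_model *f)     { (void)f; printf("hugetlb free\n"); }
  static void free_compound(struct folio_model *f) { (void)f; printf("compound free\n"); }

  /* Before: one indirect call through a destructor table. */
  typedef void (*dtor_fn)(struct folio_model *);
  static dtor_fn const dtors[] = { free_compound, free_huge };

  static void destroy_indirect(struct folio_model *f)
  {
          dtors[f->dtor_id](f);           /* retpoline-style indirect branch */
  }

  /* After: test a flag and branch to a direct call. */
  static void destroy_direct(struct folio_model *f)
  {
          if (f->is_hugetlb) {
                  free_huge(f);           /* direct call */
                  return;
          }
          free_compound(f);
  }

  int main(void)
  {
          struct folio_model huge = { .is_hugetlb = true,  .dtor_id = 1 };
          struct folio_model thp  = { .is_hugetlb = false, .dtor_id = 0 };

          destroy_indirect(&huge);
          destroy_direct(&huge);
          destroy_direct(&thp);
          return 0;
  }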
Link: https://lkml.kernel.org/r/20230816151201.3655946-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 ++- mm/page_alloc.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0a393bc02f25..5a1dfaffbd80 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,6 +26,8 @@ typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) #endif +void free_huge_page(struct page *page); + #ifdef CONFIG_HUGETLB_PAGE #include @@ -165,7 +167,6 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); -void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 986b56db96b5..744848593360 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,9 +287,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, -#ifdef CONFIG_HUGETLB_PAGE - [HUGETLB_PAGE_DTOR] = free_huge_page, -#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif @@ -612,6 +609,11 @@ void destroy_large_folio(struct folio *folio) { enum compound_dtor_id dtor = folio->_folio_dtor; + if (folio_test_hugetlb(folio)) { + free_huge_page(&folio->page); + return; + } + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); } -- cgit v1.2.3 From 454a00c40a21c59e99c526fe8cc57bd029cf8f0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:51 +0100 Subject: mm: convert free_huge_page() to free_huge_folio() Pass a folio instead of the head page to save a few instructions. Update the documentation, at least in English. Link: https://lkml.kernel.org/r/20230816151201.3655946-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Cc: Yanteng Si Cc: David Hildenbrand Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 14 +++---- .../translations/zh_CN/mm/hugetlbfs_reserv.rst | 4 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 48 +++++++++++----------- mm/page_alloc.c | 2 +- 5 files changed, 34 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index d9c2b0f01dcd..4914fbf07966 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -271,12 +271,12 @@ to the global reservation count (resv_huge_pages). Freeing Huge Pages ================== -Huge page freeing is performed by the routine free_huge_page(). This routine -is the destructor for hugetlbfs compound pages. As a result, it is only -passed a pointer to the page struct. When a huge page is freed, reservation -accounting may need to be performed. 
This would be the case if the page was -associated with a subpool that contained reserves, or the page is being freed -on an error path where a global reserve count must be restored. +Huge pages are freed by free_huge_folio(). It is only passed a pointer +to the folio as it is called from the generic MM code. When a huge page +is freed, reservation accounting may need to be performed. This would +be the case if the page was associated with a subpool that contained +reserves, or the page is being freed on an error path where a global +reserve count must be restored. The page->private field points to any subpool associated with the page. If the PagePrivate flag is set, it indicates the global reserve count should @@ -525,7 +525,7 @@ However, there are several instances where errors are encountered after a huge page is allocated but before it is instantiated. In this case, the page allocation has consumed the reservation and made the appropriate subpool, reservation map and global count adjustments. If the page is freed at this -time (before instantiation and clearing of PagePrivate), then free_huge_page +time (before instantiation and clearing of PagePrivate), then free_huge_folio will increment the global reservation count. However, the reservation map indicates the reservation was consumed. This resulting inconsistent state will cause the 'leak' of a reserved huge page. The global reserve count will diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index b7a0544224ad..0f7e7fb5ca8c 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -219,7 +219,7 @@ vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_ 释放巨页 ======== -巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 +巨页释放是由函数free_huge_folio()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 @@ -387,7 +387,7 @@ region_count()在解除私有巨页映射时被调用。在私有映射中,预 然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, 页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 -(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 +(在实例化和清除PagePrivate之前),那么free_huge_folio将增加全局预留计数。然而,预留映射 显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, 并阻止分配一个预先分配的页面。 diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5a1dfaffbd80..5b2626063f4f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,7 +26,7 @@ typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) #endif -void free_huge_page(struct page *page); +void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f498e8025cc..6a3c80026ab3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1706,10 +1706,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, zeroed = folio_put_testzero(folio); if (unlikely(!zeroed)) /* - * It is VERY unlikely soneone else has taken a ref on - * the page. In this case, we simply return as the - * hugetlb destructor (free_huge_page) will be called - * when this other ref is dropped. + * It is VERY unlikely soneone else has taken a ref + * on the folio. In this case, we simply return as + * free_huge_folio() will be called when this other ref + * is dropped. 
*/ return; @@ -1875,13 +1875,12 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -void free_huge_page(struct page *page) +void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * compound page destructor. */ - struct folio *folio = page_folio(page); struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); @@ -1936,7 +1935,7 @@ void free_huge_page(struct page *page) spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { - arch_clear_hugepage_flags(page); + arch_clear_hugepage_flags(&folio->page); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -2246,7 +2245,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) { - free_huge_page(&folio->page); /* free it into the hugepage allocator */ + free_huge_folio(folio); /* free it into the hugepage allocator */ return 1; } } @@ -2429,13 +2428,13 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. Abuse - * temporary page to workaround the nasty free_huge_page + * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - free_huge_page(&folio->page); + free_huge_folio(folio); return NULL; } @@ -2547,8 +2546,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); - struct folio *folio; - struct page *page, *tmp; + struct folio *folio, *tmp; int ret; long i; long needed, allocated; @@ -2608,21 +2606,21 @@ retry: ret = 0; /* Free the needed pages to the hugetlb pool */ - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_hugetlb_folio(h, page_folio(page)); + enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. - * Pages have no ref count, call free_huge_page directly. + * Pages have no ref count, call free_huge_folio directly. */ - list_for_each_entry_safe(page, tmp, &surplus_list, lru) - free_huge_page(page); + list_for_each_entry_safe(folio, tmp, &surplus_list, lru) + free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; @@ -2836,11 +2834,11 @@ static long vma_del_reservation(struct hstate *h, * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * - * In case 1, free_huge_page later in the error path will increment the - * global reserve count. But, free_huge_page does not have enough context + * In case 1, free_huge_folio later in the error path will increment the + * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. 
Make sure the + * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. @@ -2856,7 +2854,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented - * by free_huge_page. This will make it appear + * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This @@ -3232,7 +3230,7 @@ static void __init gather_bootmem_prealloc(void) if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { WARN_ON(folio_test_reserved(folio)); prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_page(page); /* add to the hugepage allocator */ + free_huge_folio(folio); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ free_gigantic_folio(folio, huge_page_order(h)); @@ -3264,7 +3262,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) &node_states[N_MEMORY], NULL); if (!folio) break; - free_huge_page(&folio->page); /* free it into the hugepage allocator */ + free_huge_folio(folio); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3542,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the - * page, free_huge_page will handle it by freeing the page + * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. */ spin_unlock_irq(&hugetlb_lock); @@ -3658,7 +3656,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) prep_compound_page(subpage, target_hstate->order); folio_change_private(inner_folio, NULL); prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_page(subpage); + free_huge_folio(inner_folio); } mutex_unlock(&target_hstate->resize_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 744848593360..30dc444436cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -610,7 +610,7 @@ void destroy_large_folio(struct folio *folio) enum compound_dtor_id dtor = folio->_folio_dtor; if (folio_test_hugetlb(folio)) { - free_huge_page(&folio->page); + free_huge_folio(folio); return; } -- cgit v1.2.3 From 8dc4a8f1e038189cb575f89bcd23364698b88cc1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:52 +0100 Subject: mm: convert free_transhuge_folio() to folio_undo_large_rmappable() Indirect calls are expensive, thanks to Spectre. Test for TRANSHUGE_PAGE_DTOR and destroy the folio appropriately. Move the free_compound_page() call into destroy_large_folio() to simplify later patches. 
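The rewritten folio_undo_large_rmappable() keeps the existing trick of peeking at the deferred list without the lock and only taking (and re-checking under) split_queue_lock when the folio looks queued. A small userspace sketch of that check-then-recheck pattern, with a pthread mutex standing in for the kernel spinlock:

  #include <pthread.h>
  #include <stdbool.h>
  #include <stdio.h>

  static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
  static int queue_len;

  struct item { bool queued; };  /* stand-in for !list_empty(&folio->_deferred_list) */

  static void undo_queued(struct item *it)
  {
          /* Cheap unlocked peek: most items were never queued, so skip the lock. */
          if (!it->queued)
                  return;

          pthread_mutex_lock(&queue_lock);
          /* Re-check under the lock: it may have been dequeued in the meantime. */
          if (it->queued) {
                  it->queued = false;
                  queue_len--;
          }
          pthread_mutex_unlock(&queue_lock);
  }

  int main(void)
  {
          struct item a = { .queued = true };
          struct item b = { .queued = false };

          queue_len = 1;
          undo_queued(&a);
          undo_queued(&b);
          printf("queue_len = %d\n", queue_len);
          return 0;
  }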
Link: https://lkml.kernel.org/r/20230816151201.3655946-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 -- include/linux/mm.h | 2 -- mm/huge_memory.c | 22 +++++++++++----------- mm/internal.h | 2 ++ mm/page_alloc.c | 9 ++++++--- 5 files changed, 19 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e718dbe928ba..ceda26a20830 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -141,8 +141,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); -void free_transhuge_page(struct page *page); - bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 55eb2789794e..0d14e2045658 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1253,9 +1253,7 @@ enum compound_dtor_id { #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE TRANSHUGE_PAGE_DTOR, -#endif NR_COMPOUND_DTORS, }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 154c210892a1..b33456683b93 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2776,10 +2776,9 @@ out: return ret; } -void free_transhuge_page(struct page *page) +void folio_undo_large_rmappable(struct folio *folio) { - struct folio *folio = (struct folio *)page; - struct deferred_split *ds_queue = get_deferred_split_queue(folio); + struct deferred_split *ds_queue; unsigned long flags; /* @@ -2787,15 +2786,16 @@ void free_transhuge_page(struct page *page) * deferred_list. If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. 
*/ - if (data_race(!list_empty(&folio->_deferred_list))) { - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); - } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + if (data_race(list_empty(&folio->_deferred_list))) + return; + + ds_queue = get_deferred_split_queue(folio); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + list_del(&folio->_deferred_list); } - free_compound_page(page); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } void deferred_split_folio(struct folio *folio) diff --git a/mm/internal.h b/mm/internal.h index d99ffb473f90..30bbfcacc909 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -413,6 +413,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } +void folio_undo_large_rmappable(struct folio *folio); + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30dc444436cc..4047b5897443 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,9 +287,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, -#endif }; int min_free_kbytes = 1024; @@ -614,6 +611,12 @@ void destroy_large_folio(struct folio *folio) return; } + if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) { + folio_undo_large_rmappable(folio); + free_compound_page(&folio->page); + return; + } + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); } -- cgit v1.2.3 From da6e7bf3a0315025e4199d599bd31763f0df3b4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:53 +0100 Subject: mm: convert prep_transhuge_page() to folio_prep_large_rmappable() Match folio_undo_large_rmappable(), and move the casting from page to folio into the callers (which they were largely doing anyway). 
Link: https://lkml.kernel.org/r/20230816151201.3655946-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 4 +--- mm/khugepaged.c | 2 +- mm/mempolicy.c | 15 ++++++++------- mm/page_alloc.c | 7 ++++--- 5 files changed, 16 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ceda26a20830..fa0350b0812a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -140,7 +140,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -void prep_transhuge_page(struct page *page); +void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) @@ -280,7 +280,7 @@ static inline bool hugepage_vma_check(struct vm_area_struct *vma, return false; } -static inline void prep_transhuge_page(struct page *page) {} +static inline void folio_prep_large_rmappable(struct folio *folio) {} #define transparent_hugepage_flags 0UL diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b33456683b93..5817bf77f1f0 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -577,10 +577,8 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) } #endif -void prep_transhuge_page(struct page *page) +void folio_prep_large_rmappable(struct folio *folio) { - struct folio *folio = (struct folio *)page; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9a6e0d507759..40d43eccdee8 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -896,7 +896,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, return false; } - prep_transhuge_page(*hpage); + folio_prep_large_rmappable((struct folio *)*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return true; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ec2eaceffd74..42b5567e3773 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2195,9 +2195,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); - if (page && order > 1) - prep_transhuge_page(page); folio = (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); goto out; } @@ -2208,9 +2208,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); - if (page && order > 1) - prep_transhuge_page(page); folio = (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); goto out; } @@ -2306,10 +2306,11 @@ EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned order) { struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct folio *folio = (struct folio *)page; - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); + return folio; } EXPORT_SYMBOL(folio_alloc); diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index 4047b5897443..a97d6fa9cea0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4489,10 +4489,11 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, { struct page *page = __alloc_pages(gfp | __GFP_COMP, order, preferred_nid, nodemask); + struct folio *folio = (struct folio *)page; - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); + return folio; } EXPORT_SYMBOL(__folio_alloc); -- cgit v1.2.3 From 0f2f43fabb95192c73b19586ef7536d7ac7c2f8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:54 +0100 Subject: mm: remove free_compound_page() and the compound_page_dtors array The only remaining destructor is free_compound_page(). Inline it into destroy_large_folio() and remove the array it used to live in. Link: https://lkml.kernel.org/r/20230816151201.3655946-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- mm/page_alloc.c | 24 +++++------------------- 2 files changed, 5 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d14e2045658..0955b6b13fd0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,14 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -/* - * Compound pages have a destructor function. Provide a - * prototype for that function and accessor functions. - * These are _only_ valid on the head of a compound page. - */ -typedef void compound_page_dtor(struct page *); - -/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ enum compound_dtor_id { NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, @@ -1299,8 +1291,6 @@ static inline unsigned long thp_size(struct page *page) return PAGE_SIZE << thp_order(page); } -void free_compound_page(struct page *page); - #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a97d6fa9cea0..31fec31be31e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -284,11 +284,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { - [NULL_COMPOUND_DTOR] = NULL, - [COMPOUND_PAGE_DTOR] = free_compound_page, -}; - int min_free_kbytes = 1024; int user_min_free_kbytes = -1; static int watermark_boost_factor __read_mostly = 15000; @@ -577,19 +572,13 @@ static inline void free_the_page(struct page *page, unsigned int order) * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * The first tail page's ->compound_dtor holds the offset in array of compound - * page destructors. See compound_page_dtors. + * The first tail page's ->compound_dtor describes how to destroy the + * compound page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. 
*/ -void free_compound_page(struct page *page) -{ - mem_cgroup_uncharge(page_folio(page)); - free_the_page(page, compound_order(page)); -} - void prep_compound_page(struct page *page, unsigned int order) { int i; @@ -611,14 +600,11 @@ void destroy_large_folio(struct folio *folio) return; } - if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) { + if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) folio_undo_large_rmappable(folio); - free_compound_page(&folio->page); - return; - } - VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); - compound_page_dtors[dtor](&folio->page); + mem_cgroup_uncharge(folio); + free_the_page(&folio->page, folio_order(folio)); } static inline void set_buddy_order(struct page *page, unsigned int order) -- cgit v1.2.3 From 9c5ccf2db04b8d7c3df363fdd4856c2b79ab2c6a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:55 +0100 Subject: mm: remove HUGETLB_PAGE_DTOR We can use a bit in page[1].flags to indicate that this folio belongs to hugetlb instead of using a value in page[1].dtors. That lets folio_test_hugetlb() become an inline function like it should be. We can also get rid of NULL_COMPOUND_DTOR. Link: https://lkml.kernel.org/r/20230816151201.3655946-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 10 ++---- include/linux/mm.h | 4 --- include/linux/page-flags.h | 43 ++++++++++++++++------ kernel/crash_core.c | 2 +- mm/hugetlb.c | 49 ++++---------------------- mm/page_alloc.c | 2 +- 6 files changed, 43 insertions(+), 67 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index c18d94fa6470..baa1c355741d 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -325,8 +325,8 @@ NR_FREE_PAGES On linux-2.6.21 or later, the number of free pages is in vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask ------------------------------------------------------------------------------- +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|PG_hugetlb +----------------------------------------------------------------------------------------- Page attributes. These flags are used to filter various unnecessary for dumping pages. @@ -338,12 +338,6 @@ More page attributes. These flags are used to filter various unnecessary for dumping pages. -HUGETLB_PAGE_DTOR ------------------ - -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile -excludes these pages. - x86_64 ====== diff --git a/include/linux/mm.h b/include/linux/mm.h index 0955b6b13fd0..e241f5ee4dc4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1240,11 +1240,7 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); enum compound_dtor_id { - NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, -#ifdef CONFIG_HUGETLB_PAGE - HUGETLB_PAGE_DTOR, -#endif TRANSHUGE_PAGE_DTOR, NR_COMPOUND_DTORS, }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9218028caf33..e9e7cc45352d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,15 +171,6 @@ enum pageflags { /* Remapped by swiotlb-xen. 
*/ PG_xen_remapped = PG_owner_priv_1, -#ifdef CONFIG_MEMORY_FAILURE - /* - * Compound pages. Stored in first tail page's flags. - * Indicates that at least one subpage is hwpoisoned in the - * THP. - */ - PG_has_hwpoisoned = PG_error, -#endif - /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -190,6 +181,15 @@ enum pageflags { /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif + + /* + * Flags only valid for compound pages. Stored in first tail page's + * flags word. + */ + + /* At least one page in this folio has the hwpoison flag set */ + PG_has_hwpoisoned = PG_error, + PG_hugetlb = PG_active, }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -812,7 +812,23 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); -bool folio_test_hugetlb(struct folio *folio); +SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) +CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) + +/** + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs + * @folio: The folio to test. + * + * Context: Any context. Caller should have a reference on the folio to + * prevent it from being turned into a tail page. + * Return: True for hugetlbfs folios, false for anon folios or folios + * belonging to other filesystems. + */ +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return folio_test_large(folio) && + test_bit(PG_hugetlb, folio_flags(folio, 1)); +} #else TESTPAGEFLAG_FALSE(Huge, hugetlb) #endif @@ -1056,6 +1072,13 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) +/* + * Flags stored in the second page of a compound page. They may overlap + * the CHECK_AT_FREE flags above, so need to be cleared. + */ +#define PAGE_FLAGS_SECOND \ + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 90ce1dfd591c..dd5f87047d06 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -490,7 +490,7 @@ static int __init crash_save_vmcoreinfo_init(void) #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); + VMCOREINFO_NUMBER(PG_hugetlb); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6a3c80026ab3..a82c3104337e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1585,25 +1585,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h, { lockdep_assert_held(&hugetlb_lock); - /* - * Very subtle - * - * For non-gigantic pages set the destructor to the normal compound - * page dtor. This is needed in case someone takes an additional - * temporary ref to the page, and freeing is delayed until they drop - * their reference. - * - * For gigantic pages set the destructor to the null dtor. This - * destructor will never be called. Before freeing the gigantic - * page destroy_compound_gigantic_folio will turn the folio into a - * simple group of pages. After this the destructor does not - * apply. 
- * - */ - if (hstate_is_gigantic(h)) - folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); - else - folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); + folio_clear_hugetlb(folio); } /* @@ -1690,7 +1672,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; } - folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -1814,9 +1796,8 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to - * remove_hugetlb_folio() will call folio_set_compound_dtor - * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate() - * directly. + * remove_hugetlb_folio() will clear the hugetlb bit, so do + * not use folio_hstate() directly. */ h = size_to_hstate(page_size(page)); @@ -1955,7 +1936,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2070,28 +2051,10 @@ int PageHuge(struct page *page) if (!PageCompound(page)) return 0; folio = page_folio(page); - return folio->_folio_dtor == HUGETLB_PAGE_DTOR; + return folio_test_hugetlb(folio); } EXPORT_SYMBOL_GPL(PageHuge); -/** - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs - * @folio: The folio to test. - * - * Context: Any context. Caller should have a reference on the folio to - * prevent it from being turned into a tail page. - * Return: True for hugetlbfs folios, false for anon folios or folios - * belonging to other filesystems. - */ -bool folio_test_hugetlb(struct folio *folio) -{ - if (!folio_test_large(folio)) - return false; - - return folio->_folio_dtor == HUGETLB_PAGE_DTOR; -} -EXPORT_SYMBOL_GPL(folio_test_hugetlb); - /* * Find and lock address space (mapping) in write mode. * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31fec31be31e..d96dc6a3077a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1112,7 +1112,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); if (compound) - ClearPageHasHWPoisoned(page); + page[1].flags &= ~PAGE_FLAGS_SECOND; for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_page_prepare(page, page + i); -- cgit v1.2.3 From de53c05f2ae3d47d30db58e9c4e54e3bbc868377 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:56 +0100 Subject: mm: add large_rmappable page flag Stored in the first tail page's flags, this flag replaces the destructor. That removes the last of the destructors, so remove all references to folio_dtor and compound_dtor. 
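Both PG_hugetlb and the new PG_large_rmappable now live in the first tail page's flags word, and a later patch in this series packs the folio order into the low byte of that same word. A userspace toy of the packing scheme; the bit positions are purely illustrative, not the real PG_* values.

  #include <assert.h>
  #include <stdio.h>

  #define ORDER_MASK      0xffUL          /* low byte: folio order */
  #define FLAG_HUGETLB    (1UL << 8)      /* illustrative positions only */
  #define FLAG_RMAPPABLE  (1UL << 9)

  static unsigned long set_order(unsigned long flags, unsigned int order)
  {
          return (flags & ~ORDER_MASK) | order;
  }

  int main(void)
  {
          unsigned long flags1 = 0;       /* models folio->_flags_1 */

          flags1 = set_order(flags1, 9);  /* e.g. a 2MiB folio on 4KiB pages */
          flags1 |= FLAG_RMAPPABLE;

          assert((flags1 & ORDER_MASK) == 9);
          printf("order=%lu rmappable=%d hugetlb=%d\n",
                 flags1 & ORDER_MASK,
                 !!(flags1 & FLAG_RMAPPABLE),
                 !!(flags1 & FLAG_HUGETLB));
          return 0;
  }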
Link: https://lkml.kernel.org/r/20230816151201.3655946-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 4 ++-- include/linux/mm.h | 13 ------------- include/linux/mm_types.h | 2 -- include/linux/page-flags.h | 7 ++++++- kernel/crash_core.c | 1 - mm/huge_memory.c | 4 ++-- mm/internal.h | 1 - mm/page_alloc.c | 7 +------ 8 files changed, 11 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index baa1c355741d..3bd38ac0e7de 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -141,8 +141,8 @@ nodemask_t The size of a nodemask_t type. Used to compute the number of online nodes. -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) -------------------------------------------------------------------------------------------------- +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_head) +---------------------------------------------------------------------------------- User-space tools compute their values based on the offset of these variables. The variables are used when excluding unnecessary pages. diff --git a/include/linux/mm.h b/include/linux/mm.h index e241f5ee4dc4..63d573781973 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,19 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -enum compound_dtor_id { - COMPOUND_PAGE_DTOR, - TRANSHUGE_PAGE_DTOR, - NR_COMPOUND_DTORS, -}; - -static inline void folio_set_compound_dtor(struct folio *folio, - enum compound_dtor_id compound_dtor) -{ - VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); - folio->_folio_dtor = compound_dtor; -} - void destroy_large_folio(struct folio *folio); /* Returns the number of bytes in this potentially compound page. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de5ac95572c8..1c5c2349c18e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). 
@@ -318,7 +317,6 @@ struct folio { unsigned long _flags_1; unsigned long _head_1; /* public: */ - unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e9e7cc45352d..85d54b6c9e0b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,6 +190,7 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, PG_hugetlb = PG_active, + PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -806,6 +807,9 @@ static inline void ClearPageCompound(struct page *page) BUG_ON(!PageHead(page)); ClearPageHead(page); } +PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) +#else +TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #endif #define PG_head_mask ((1UL << PG_head)) @@ -1077,7 +1081,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ + 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index dd5f87047d06..934dd86e19f5 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(folio, _folio_dtor); VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5817bf77f1f0..2fe1ea187b6b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -581,7 +581,7 @@ void folio_prep_large_rmappable(struct folio *folio) { VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); - folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); + folio_set_large_rmappable(folio); } static inline bool is_transparent_hugepage(struct page *page) @@ -593,7 +593,7 @@ static inline bool is_transparent_hugepage(struct page *page) folio = page_folio(page); return is_huge_zero_page(&folio->page) || - folio->_folio_dtor == TRANSHUGE_PAGE_DTOR; + folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, diff --git a/mm/internal.h b/mm/internal.h index 30bbfcacc909..5c0daea731f3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -419,7 +419,6 @@ static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; - folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d96dc6a3077a..452459836b71 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -572,9 +572,6 @@ static inline void free_the_page(struct page *page, unsigned int order) * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * The first tail page's ->compound_dtor describes how to destroy the - * compound page. - * * The first tail page's ->compound_order holds the order of allocation. 
* This usage means that zero-order pages may not be compound. */ @@ -593,14 +590,12 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio->_folio_dtor; - if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } - if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) + if (folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); -- cgit v1.2.3 From ebc1baf5c9b46c2240c580a2fd992b2e48606dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:58 +0100 Subject: mm: free up a word in the first tail page Store the folio order in the low byte of the flags word in the first tail page. This frees up the word that was being used to store the order and dtor bytes previously. Link: https://lkml.kernel.org/r/20230816151201.3655946-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- include/linux/mm_types.h | 3 +-- include/linux/page-flags.h | 7 ++++--- kernel/crash_core.c | 1 - mm/internal.h | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d573781973..939386e0aeda 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1000,7 +1000,7 @@ struct inode; * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be - * set before _folio_order is initialised, or this may be a tail page. + * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. */ static inline unsigned int compound_order(struct page *page) @@ -1009,7 +1009,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } /** @@ -1025,7 +1025,7 @@ static inline unsigned int folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } #include @@ -1996,7 +1996,7 @@ static inline long folio_nr_pages(struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } @@ -2014,7 +2014,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c5c2349c18e..40afb3bbc309 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). 
@@ -316,8 +315,8 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + unsigned long _folio_avail; /* public: */ - unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 46fc05c648ff..638b0a96b4c5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -184,7 +184,8 @@ enum pageflags { /* * Flags only valid for compound pages. Stored in first tail page's - * flags word. + * flags word. Cannot use the first 8 flags or any flag marked as + * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ @@ -1081,8 +1082,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ - 1UL << PG_large_rmappable) + (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ + 1UL << PG_hugetlb | 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 934dd86e19f5..693445e1f7f6 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); diff --git a/mm/internal.h b/mm/internal.h index 5c0daea731f3..d1d4bf4e63c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -407,7 +407,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; - folio->_folio_order = order; + folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif -- cgit v1.2.3 From 6199277baf73a4877b42991b97c40e173e530756 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:59 +0100 Subject: mm: remove folio_test_transhuge() This function is misleading; people think it means "Is this a THP", when all it actually does is check whether this is a large folio. Remove it; the one remaining user should have been checking to see whether the folio is PMD sized or not. 
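Editor's aside on the encoding used by the "mm: free up a word in the first tail page" patch above: the folio order now lives in the low byte of the first tail page's flags word, which is why folio_set_order() masks with ~0xffUL and the readers use "_flags_1 & 0xff". A minimal user-space sketch of that packing scheme (illustrative only; the helper names below are hypothetical, not kernel API):

#include <assert.h>
#include <stdio.h>

/* Toy model: the low 8 bits of a flags word hold the order, the
 * remaining bits are ordinary flag bits (mirrors the ~0xffUL masking
 * used by folio_set_order() in the patch above). */
static unsigned long flags_set_order(unsigned long flags, unsigned int order)
{
	assert(order < 256);		/* order must fit in the low byte */
	return (flags & ~0xffUL) | order;
}

static unsigned int flags_get_order(unsigned long flags)
{
	return flags & 0xff;
}

int main(void)
{
	unsigned long flags = 1UL << 20;	/* some unrelated flag bit set */

	flags = flags_set_order(flags, 9);
	printf("order=%u nr_pages=%lu high_bit_kept=%d\n",
	       flags_get_order(flags), 1UL << flags_get_order(flags),
	       (flags & (1UL << 20)) != 0);
	return 0;
}

This is also why PAGE_FLAGS_SECOND now reserves 0xffUL for the order: those bits must be ignored and cleared as a unit, not treated as individual page flags.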
Link: https://lkml.kernel.org/r/20230816151201.3655946-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- mm/memcontrol.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 638b0a96b4c5..5c02720c53a5 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -853,11 +853,6 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } -static inline bool folio_test_transhuge(struct folio *folio) -{ - return folio_test_head(folio); -} - /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e125aa5a18d..de6b40f85113 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5755,7 +5755,7 @@ static int mem_cgroup_move_account(struct page *page, if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (folio_test_transhuge(folio)) { + if (folio_test_pmd_mappable(folio)) { __mod_lruvec_state(from_vec, NR_ANON_THPS, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_THPS, -- cgit v1.2.3 From a644b0abbfe1d7cf775082cafdcc7b5f3c35becf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:12:01 +0100 Subject: mm: convert split_huge_pages_pid() to use a folio Replaces five calls to compound_head with one. Link: https://lkml.kernel.org/r/20230816151201.3655946-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/huge_memory.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2fe1ea187b6b..213bb1e33830 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -584,14 +584,11 @@ void folio_prep_large_rmappable(struct folio *folio) folio_set_large_rmappable(folio); } -static inline bool is_transparent_hugepage(struct page *page) +static inline bool is_transparent_hugepage(struct folio *folio) { - struct folio *folio; - - if (!PageCompound(page)) + if (!folio_test_large(folio)) return false; - folio = page_folio(page); return is_huge_zero_page(&folio->page) || folio_test_large_rmappable(folio); } @@ -3012,6 +3009,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct page *page; + struct folio *folio; if (!vma) break; @@ -3028,22 +3026,23 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (IS_ERR_OR_NULL(page)) continue; - if (!is_transparent_hugepage(page)) + folio = page_folio(page); + if (!is_transparent_hugepage(folio)) goto next; total++; - if (!can_split_folio(page_folio(page), NULL)) + if (!can_split_folio(folio, NULL)) goto next; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(page)) + if (!split_folio(folio)) split++; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); cond_resched(); } mmap_read_unlock(mm); -- cgit v1.2.3 From 6c1419730822fe991fc15bfd7059f6872a71a7af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 22 Aug 2023 
15:30:43 -0700 Subject: hugetlb: clear flags in tail pages that will be freed individually hugetlb manually creates and destroys compound pages. As such it makes assumptions about struct page layout. Commit ebc1baf5c9b4 ("mm: free up a word in the first tail page") breaks hugetlb. The following will fix the breakage. Link: https://lkml.kernel.org/r/20230822231741.GC4509@monkey Fixes: ebc1baf5c9b4 ("mm: free up a word in the first tail page") Signed-off-by: Mike Kravetz Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82c3104337e..cbc25826c9b0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1484,6 +1484,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); + p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; p->mapping = NULL; clear_compound_head(p); if (!demote) @@ -1702,8 +1703,6 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i; - struct page *subpage; bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1745,14 +1744,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, spin_unlock_irq(&hugetlb_lock); } - for (i = 0; i < pages_per_huge_page(h); i++) { - subpage = folio_page(folio, i); - subpage->flags &= ~(1 << PG_locked | 1 << PG_error | - 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_private | - 1 << PG_writeback); - } - /* * Non-gigantic pages demoted from CMA allocated gigantic pages * need to be given back to CMA in free_gigantic_folio. -- cgit v1.2.3 From a98460494b16db9c377e55bc13e5407a0eb79fe8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Aug 2023 12:51:20 -0700 Subject: mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() thought it had emptied: page lock on the huge page is enough to protect against WP faults (which find the PTE has been cleared), but not enough to protect against userfaultfd. "BUG: Bad rss-counter state" followed. retract_page_tables() protects against this by checking !vma->anon_vma; but we know that MADV_COLLAPSE needs to be able to work on private shmem mappings, even those with an anon_vma prepared for another part of the mapping; and we know that MADV_COLLAPSE needs to work on shared shmem mappings which are userfaultfd_armed(). Whether it needs to work on private shmem mappings which are userfaultfd_armed(), I'm not so sure: but assume that it does. Just for this case, take the pmd_lock() two steps earlier: not because it gives any protection against this case itself, but because ptlock nests inside it, and it's the dropping of ptlock which let the bug in. In other cases, continue to minimize the pmd_lock() hold time. 
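A rough user-space sketch of the locking shape described above, with pthread mutexes standing in for pmd_lock() ("pml") and the page table lock ("ptl"); illustrative only, and it ignores the case where the two are the same lock (non-split page table lock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t pml_lock = PTHREAD_MUTEX_INITIALIZER;	/* outer */
static pthread_mutex_t ptl_lock = PTHREAD_MUTEX_INITIALIZER;	/* inner */

static void collapse_like_operation(bool uffd_private)
{
	bool pml_held = false;

	if (uffd_private) {			/* take the outer lock early */
		pthread_mutex_lock(&pml_lock);
		pml_held = true;
	}

	pthread_mutex_lock(&ptl_lock);		/* ptl nests inside pml */
	/* ... step 2: clear the page table entries under ptl ... */

	if (!pml_held) {
		/* usual case: drop ptl to keep pml hold time short, then
		 * take pml only for the final step and re-take ptl under it */
		pthread_mutex_unlock(&ptl_lock);
		pthread_mutex_lock(&pml_lock);
		pml_held = true;
		pthread_mutex_lock(&ptl_lock);
	}
	/* ... step 4: detach the empty page table with both locks held;
	 * in the uffd_private case ptl was never dropped in between ... */
	pthread_mutex_unlock(&ptl_lock);
	pthread_mutex_unlock(&pml_lock);
	printf("done (uffd_private=%d)\n", uffd_private);
}

int main(void)
{
	collapse_like_operation(false);
	collapse_like_operation(true);
	return 0;
}

The point of the early pml in the uffd case is exactly the "never drop ptl" path: nothing can repopulate the range while the table is being detached.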
Link: https://lkml.kernel.org/r/4d31abf5-56c0-9f3d-d12f-c9317936691@google.com Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()") Signed-off-by: Hugh Dickins Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5FW-kYD5Rg5xo_RDBM0LTTqZQ@mail.gmail.com/ Acked-by: Peter Xu Cc: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/khugepaged.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40d43eccdee8..d5650541083a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1476,7 +1476,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; - spinlock_t *pml, *ptl; + spinlock_t *pml = NULL, *ptl; int nr_ptes = 0, result = SCAN_FAIL; int i; @@ -1572,9 +1572,25 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + + /* + * pmd_lock covers a wider range than ptl, and (if split from mm's + * page_table_lock) ptl nests inside pml. The less time we hold pml, + * the better; but userfaultfd's mfill_atomic_pte() on a private VMA + * inserts a valid as-if-COWed PTE without even looking up page cache. + * So page lock of hpage does not protect from it, so we must not drop + * ptl before pgt_pmd is removed, so uffd private needs pml taken now. + */ + if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) + pml = pmd_lock(mm, pmd); + + start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; + if (!pml) + spin_lock(ptl); + else if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; @@ -1608,7 +1624,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, nr_ptes++; } - pte_unmap_unlock(start_pte, ptl); + pte_unmap(start_pte); + if (!pml) + spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { @@ -1616,12 +1634,12 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); } - /* step 4: remove page table */ - - /* Huge page lock is still held, so page table must remain empty */ - pml = pmd_lock(mm, pmd); - if (ptl != pml) - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + /* step 4: remove empty page table */ + if (!pml) { + pml = pmd_lock(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); if (ptl != pml) @@ -1648,6 +1666,8 @@ abort: } if (start_pte) pte_unmap_unlock(start_pte, ptl); + if (pml && pml != ptl) + spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_hpage: -- cgit v1.2.3 From 08dff2810e8feb3096bf5c8242ab1649d1e8b1a4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 12 Aug 2023 16:56:25 +0100 Subject: mm/memory.c: fix mismerge Fix a build issue. 
Link: https://lkml.kernel.org/r/ZNerqcNS4EBJA/2v@casper.infradead.org Fixes: 4aaa60dad4d1 ("mm: allow per-VMA locks on file-backed VMAs") Signed-off-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308121909.XNYBtqNI-lkp@intel.com/ Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4a7c8be9fe71..f9c3ad489823 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5457,7 +5457,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (unlikely(!vma->anon_vma && !vma_is_tcp(vma))) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* -- cgit v1.2.3 From d51b68469bc7804c34622f7f3d4889628d37cfd6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 1 Jul 2023 15:28:37 +0800 Subject: mm: memory-failure: fix potential page refcnt leak in memory_failure() put_ref_page() is not called to drop extra refcnt when it comes from madvise in the case pfn is valid but pgmap is NULL, leading to a page refcnt leak. Link: https://lkml.kernel.org/r/20230701072837.1994253-1-linmiaohe@huawei.com Fixes: 1e8aaedb182d ("mm,memory_failure: always pin the page in madvise_inject_error") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 55dfe8a7bf4b..881c35ef1daa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2117,8 +2117,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, { int rc = -ENXIO; - put_ref_page(pfn, flags); - /* device metadata space is not recoverable */ if (!pgmap_pfn_valid(pgmap, pfn)) goto out; @@ -2193,6 +2191,7 @@ int memory_failure(unsigned long pfn, int flags) if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); + put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, pgmap); -- cgit v1.2.3 From b243dcbf2f13856e39e18df3a15a65f6fe33db85 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:52 -0700 Subject: swap: remove remnants of polling from read_swap_cache_async Patch series "Per-VMA lock support for swap and userfaults", v7. When per-VMA locks were introduced in [1] several types of page faults would still fall back to mmap_lock to keep the patchset simple. Among them are swap and userfault pages. The main reason for skipping those cases was the fact that mmap_lock could be dropped while handling these faults and that required additional logic to be implemented. Implement the mechanism to allow per-VMA locks to be dropped for these cases. First, change handle_mm_fault to drop per-VMA locks when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED to be consistent with the way mmap_lock is handled. Then change folio_lock_or_retry to accept vm_fault and return vm_fault_t which simplifies later patches. Finally allow swap and uffd page faults to be handled under per-VMA locks by dropping per-VMA and retrying, the same way it's done under mmap_lock. Naturally, once VMA lock is dropped that VMA should be assumed unstable and can't be used. This patch (of 6): Commit [1] introduced IO polling support during swapin to reduce swap read latency for block devices that can be polled. However later commit [2] removed polling support.
Therefore it seems safe to remove do_poll parameter in read_swap_cache_async and always call swap_readpage with synchronous=false waiting for IO completion in folio_lock_or_retry. [1] commit 23955622ff8d ("swap: add block io poll in swapin path") [2] commit 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio") Link: https://lkml.kernel.org/r/20230630211957.1341547-1-surenb@google.com Link: https://lkml.kernel.org/r/20230630211957.1341547-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Peter Xu Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/madvise.c | 4 ++-- mm/swap.h | 1 - mm/swap_state.c | 12 +++++------- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index b1f53a95e3a5..4dded5d27e7e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -217,7 +217,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, ptep = NULL; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, addr, false, &splug); + vma, addr, &splug); if (page) put_page(page); } @@ -262,7 +262,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma, rcu_read_unlock(); page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), - vma, addr, false, &splug); + vma, addr, &splug); if (page) put_page(page); diff --git a/mm/swap.h b/mm/swap.h index 7c033d793f15..8a3c7a0ace4f 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -46,7 +46,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, - bool do_poll, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, diff --git a/mm/swap_state.c b/mm/swap_state.c index d157862ba0a6..01f15139b7d9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -526,15 +526,14 @@ fail_put_swap: */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, - unsigned long addr, bool do_poll, - struct swap_iocb **plug) + unsigned long addr, struct swap_iocb **plug) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage, do_poll, plug); + swap_readpage(retpage, false, plug); return retpage; } @@ -629,7 +628,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; - bool do_poll = true, page_allocated; + bool page_allocated; struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; @@ -637,7 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; - do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. 
*/ start_offset = offset & ~mask; end_offset = offset | mask; @@ -669,7 +667,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); + return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -837,7 +835,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - ra_info.win == 1, NULL); + NULL); } /** -- cgit v1.2.3 From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. [willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- mm/memory.c | 12 ++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc6552..2e5d1e238af9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0..b1723094d464 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48c..6115d7514972 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e..a063774ba584 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3..2e861b9360c7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/mm/memory.c b/mm/memory.c index f9c3ad489823..b9c3780fd426 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3747,6 +3747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ret = VM_FAULT_RETRY; + vma_end_read(vma); goto out; } @@ -5248,6 +5249,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } +#ifdef CONFIG_PER_VMA_LOCK + /* + * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of + * the assumption that lock is dropped on 
VM_FAULT_RETRY. + */ + if (WARN_ON_ONCE((*flags & + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) + return VM_FAULT_SIGSEGV; +#endif + return 0; } -- cgit v1.2.3 From fdc724d6aa44efd75cc9b6a3c3900baac44bc50a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:55 -0700 Subject: mm: change folio_lock_or_retry to use vm_fault directly Change folio_lock_or_retry to accept vm_fault struct and return the vm_fault_t directly. Link: https://lkml.kernel.org/r/20230630211957.1341547-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 11 ++++++----- mm/filemap.c | 22 ++++++++++++---------- mm/memory.c | 14 ++++++-------- 3 files changed, 24 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f4f24b594cd7..437e4526028c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -916,8 +916,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); -bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, - unsigned int flags); +vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); @@ -1021,11 +1020,13 @@ static inline int folio_lock_killable(struct folio *folio) * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool folio_lock_or_retry(struct folio *folio, - struct mm_struct *mm, unsigned int flags) +static inline vm_fault_t folio_lock_or_retry(struct folio *folio, + struct vm_fault *vmf) { might_sleep(); - return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); + if (!folio_trylock(folio)) + return __folio_lock_or_retry(folio, vmf); + return 0; } /* diff --git a/mm/filemap.c b/mm/filemap.c index dd022b065614..40514493014a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1669,32 +1669,34 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) /* * Return values: - * true - folio is locked; mmap_lock is still held. - * false - folio is not locked. + * 0 - folio is locked. + * non-zero - folio is not locked. * mmap_lock has been released (mmap_read_unlock(), unless flags had both * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in * which case mmap_lock is still held. * - * If neither ALLOW_RETRY nor KILLABLE are set, will always return true + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock unperturbed. */ -bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, - unsigned int flags) +vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned int flags = vmf->flags; + if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! 
In this case, mmap_lock is not released - * even though return 0. + * even though return VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) - return false; + return VM_FAULT_RETRY; mmap_read_unlock(mm); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); - return false; + return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; @@ -1702,13 +1704,13 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, ret = __folio_lock_killable(folio); if (ret) { mmap_read_unlock(mm); - return false; + return VM_FAULT_RETRY; } } else { __folio_lock(folio); } - return true; + return 0; } /** diff --git a/mm/memory.c b/mm/memory.c index b9c3780fd426..080e1d59d752 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3599,6 +3599,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; + vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold @@ -3611,9 +3612,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_try_get(folio)) return 0; - if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) { + ret = folio_lock_or_retry(folio, vmf); + if (ret) { folio_put(folio); - return VM_FAULT_RETRY; + return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3738,7 +3740,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) bool exclusive = false; swp_entry_t entry; pte_t pte; - int locked; vm_fault_t ret = 0; void *shadow = NULL; @@ -3861,12 +3862,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); - - if (!locked) { - ret |= VM_FAULT_RETRY; + ret |= folio_lock_or_retry(folio, vmf); + if (ret & VM_FAULT_RETRY) goto out_release; - } if (swapcache) { /* -- cgit v1.2.3 From 1235ccd05b6dd6970ff50baea99aa994023fbc4a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:56 -0700 Subject: mm: handle swap page faults under per-VMA lock When page fault is handled under per-VMA lock protection, all swap page faults are retried with mmap_lock because folio_lock_or_retry has to drop and reacquire mmap_lock if folio could not be immediately locked. Follow the same pattern as mmap_lock to drop per-VMA lock when waiting for folio and retrying once folio is available. With this obstacle removed, enable do_swap_page to operate under per-VMA lock protection. Drivers implementing ops->migrate_to_ram might still rely on mmap_lock, therefore we have to fall back to mmap_lock in that particular case. Note that the only time do_swap_page calls synchronous swap_readpage is when SWP_SYNCHRONOUS_IO is set, which is only set for QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and pmem). Therefore we don't sleep in this path, and there's no need to drop the mmap or per-VMA lock. Link: https://lkml.kernel.org/r/20230630211957.1341547-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Alistair Popple Reviewed-by: Alistair Popple Acked-by: Peter Xu Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++++ mm/filemap.c | 17 ++++++++--------- mm/memory.c | 16 ++++++++++------ 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 939386e0aeda..0d16208178c7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) vma->detached = detached; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_end_read(vmf->vma); + else + mmap_read_unlock(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + mmap_read_unlock(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/mm/filemap.c b/mm/filemap.c index 40514493014a..8040545954bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1671,27 +1671,26 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) * Return values: * 0 - folio is locked. * non-zero - folio is not locked. - * mmap_lock has been released (mmap_read_unlock(), unless flags had both - * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in - * which case mmap_lock is still held. + * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or + * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 - * with the folio locked and the mmap_lock unperturbed. + * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { - struct mm_struct *mm = vmf->vma->vm_mm; unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* - * CAUTION! In this case, mmap_lock is not released - * even though return VM_FAULT_RETRY. + * CAUTION! In this case, mmap_lock/per-VMA lock is not + * released even though returning VM_FAULT_RETRY. 
*/ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; - mmap_read_unlock(mm); + release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else @@ -1703,7 +1702,7 @@ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) ret = __folio_lock_killable(folio); if (ret) { - mmap_read_unlock(mm); + release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { diff --git a/mm/memory.c b/mm/memory.c index 080e1d59d752..5748a41c164c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3746,12 +3746,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - ret = VM_FAULT_RETRY; - vma_end_read(vma); - goto out; - } - entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { @@ -3761,6 +3755,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->page = pfn_swap_entry_to_page(entry); ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + /* + * migrate_to_ram is not yet ready to operate + * under VMA lock. + */ + vma_end_read(vma); + ret = VM_FAULT_RETRY; + goto out; + } + vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); -- cgit v1.2.3 From 29a22b9e08d70d6c9b075c12c47b6e895cb65cf0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:57 -0700 Subject: mm: handle userfaults under VMA lock Enable handle_userfault to operate under VMA lock by releasing VMA lock instead of mmap_lock and retrying. Note that FAULT_FLAG_RETRY_NOWAIT should never be used when handling faults under per-VMA lock protection because that would break the assumption that lock is dropped on retry. [surenb@google.com: fix a lockdep issue in vma_assert_write_locked] Link: https://lkml.kernel.org/r/20230712195652.969194-1-surenb@google.com Link: https://lkml.kernel.org/r/20230630211957.1341547-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 34 ++++++++++++++-------------------- include/linux/mm.h | 20 ++++++++++++++++++++ mm/memory.c | 9 +-------- 3 files changed, 35 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 70bd2951b68d..1091cb461747 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. 
*/ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { + struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(ctx->mm); + assert_fault_locked(vmf); - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -308,10 +307,8 @@ out: } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { return false; /* should never get here */ } @@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, * threads. */ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - unsigned long address, - unsigned long flags, + struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t ptent; bool ret = true; - mmap_assert_locked(mm); + assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -440,7 +437,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * Coredumping runs without mmap_lock so we can only check that * the mmap_lock is held, if PF_DUMPCORE was not set. */ - mmap_assert_locked(mm); + assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -556,15 +553,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + must_wait = userfaultfd_must_wait(ctx, vmf, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vma, - vmf->address, - vmf->flags, reason); + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); - mmap_read_unlock(mm); + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d16208178c7..c1db400e83cb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -679,6 +679,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) rcu_read_unlock(); } +/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -721,6 +722,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + if (!rwsem_is_locked(&vma->vm_lock->lock)) + vma_assert_write_locked(vma); +} + static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ @@ -737,6 +744,14 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_assert_locked(vmf->vma); + else + mmap_assert_locked(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -762,6 +777,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + mmap_assert_locked(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/mm/memory.c b/mm/memory.c index 5748a41c164c..2c6f45d18b73 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5471,14 +5471,7 @@ retry: * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (vma_is_anonymous(vma) && !vma->anon_vma) - goto inval_end_read; - - /* - * Due to the possibility of userfault handler dropping mmap_lock, avoid - * it for now and fall back to page fault handling under mmap_lock. - */ - if (userfaultfd_armed(vma)) + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ -- cgit v1.2.3 From f82e6bf9bb9b6e1dc001320a88eee67d7ac31e96 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 26 Jul 2023 15:32:23 +0000 Subject: mm: memcg: use rstat for non-hierarchical stats Currently, memcg uses rstat to maintain aggregated hierarchical stats. Counters are maintained for hierarchical stats at each memcg. Rstat tracks which cgroups have updates on which cpus to keep those counters fresh on the read-side. Non-hierarchical stats are currently not covered by rstat. Their per-cpu counters are summed up on every read, which is expensive. The original implementation did the same. At some point before rstat, non-hierarchical aggregated counters were introduced by commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting"). However, those counters were updated on the performance critical write-side, which caused regressions, so they were later removed by commit 815744d75152 ("mm: memcontrol: don't batch updates of local VM stats and events"). See [1] for more detailed history. Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a half) enjoyed cheap reads of non-hierarchical stats, specifically on cgroup v1. When moving to more recent kernels, a performance regression for reading non-hierarchical stats is observed. Now that we have rstat, we know exactly which percpu counters have updates for each stat. We can maintain non-hierarchical counters again, making reads much more efficient, without affecting the performance critical write-side. 
Hence, add non-hierarchical (i.e local) counters for the stats, and extend rstat flushing to keep those up-to-date. A caveat is that we now need a stats flush before reading local/non-hierarchical stats through {memcg/lruvec}_page_state_local() or memcg_events_local(), where we previously only needed a flush to read hierarchical stats. Most contexts reading non-hierarchical stats are already doing a flush, add a flush to the only missing context in count_shadow_nodes(). With this patch, reading memory.stat from 1000 memcgs is 3x faster on a machine with 256 cpus on cgroup v1: # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done # time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null real 0m0.125s user 0m0.005s sys 0m0.120s After: real 0m0.032s user 0m0.005s sys 0m0.027s To make sure there are no regressions on cgroup v2, I ran an artificial reclaim/refault stress test [2] that creates (NR_CPUS * 2) cgroups, assigns them limits, runs a worker process in each cgroup that allocates tmpfs memory equal to quadruple the limit (to invoke reclaim continuously), and then reads back the entire file (to invoke refaults). All workers are run in parallel, and zram is used as a swapping backend. Both reclaim and refault have conditional stats flushing. I ran this on a machine with 112 cpus, once on mm-unstable, and once on mm-unstable with this patch reverted. (1) A few runs without this patch: # time ./stress_reclaim_refault.sh real 0m9.949s user 0m0.496s sys 14m44.974s # time ./stress_reclaim_refault.sh real 0m10.049s user 0m0.486s sys 14m55.791s # time ./stress_reclaim_refault.sh real 0m9.984s user 0m0.481s sys 14m53.841s (2) A few runs with this patch: # time ./stress_reclaim_refault.sh real 0m9.885s user 0m0.486s sys 14m48.753s # time ./stress_reclaim_refault.sh real 0m9.903s user 0m0.495s sys 14m48.339s # time ./stress_reclaim_refault.sh real 0m9.861s user 0m0.507s sys 14m49.317s No regressions are observed with this patch. There is actually a very slight improvement. If I have to guess, maybe it's because we avoid the percpu loop in count_shadow_nodes() when calling lruvec_page_state_local(), but I could not prove this using perf, it's probably in the noise. 
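For readers who want to see the flush step in isolation, here is a small user-space model of the delta propagation described above (illustrative only, with hypothetical names; the real code lives in mem_cgroup_css_rstat_flush() below):

#include <stdio.h>

#define NR_CPUS 4

/* Toy model: each group keeps per-CPU counters plus a snapshot of the
 * value seen at the last flush.  On flush, the per-CPU delta feeds the
 * group's own ("local") counter, while the combined delta (own CPUs plus
 * the children's pending contribution) feeds the hierarchical counter and
 * is forwarded to the parent as pending. */
struct group {
	struct group *parent;
	long percpu[NR_CPUS];		/* updated on the hot path */
	long percpu_prev[NR_CPUS];	/* snapshot at last flush */
	long local;			/* non-hierarchical total */
	long hierarchical;		/* includes descendants */
	long pending;			/* accumulated for the parent */
};

static void flush_group(struct group *g)
{
	long delta = g->pending;

	g->pending = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		long delta_cpu = g->percpu[cpu] - g->percpu_prev[cpu];

		g->percpu_prev[cpu] = g->percpu[cpu];
		g->local += delta_cpu;	/* own updates only */
		delta += delta_cpu;
	}
	g->hierarchical += delta;
	if (g->parent)
		g->parent->pending += delta;
}

int main(void)
{
	struct group parent = { 0 }, child = { .parent = &parent };

	child.percpu[0] = 5;		/* hot-path updates */
	child.percpu[2] = 3;
	parent.percpu[1] = 7;

	flush_group(&child);		/* children flush before parents */
	flush_group(&parent);

	printf("child:  local=%ld hierarchical=%ld\n", child.local, child.hierarchical);
	printf("parent: local=%ld hierarchical=%ld\n", parent.local, parent.hierarchical);
	return 0;
}

Running this prints local=8/hierarchical=8 for the child and local=7/hierarchical=15 for the parent: the local counter excludes descendants while the hierarchical one includes them, and neither adds work to the write-side hot path.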
[1] https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/ [2] https://lore.kernel.org/lkml/CAJD7tkb17x=qwoO37uxyYXLEUVp15BQKR+Xfh7Sg9Hx-wTQ_=w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230803185046.1385770-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230726153223.821757-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++--- mm/memcontrol.c | 67 ++++++++++++++++++++++++++-------------------- mm/workingset.c | 1 + 3 files changed, 43 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 163004ae3349..11810a2cfd2d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,6 +111,9 @@ struct lruvec_stats { /* Aggregated (CPU and subtree) state */ long state[NR_VM_NODE_STAT_ITEMS]; + /* Non-hierarchical (CPU aggregated) state */ + long state_local[NR_VM_NODE_STAT_ITEMS]; + /* Pending child counts during tree propagation */ long state_pending[NR_VM_NODE_STAT_ITEMS]; }; @@ -1018,14 +1021,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, { struct mem_cgroup_per_node *pn; long x = 0; - int cpu; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); + x = READ_ONCE(pn->lruvec_stats.state_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index de6b40f85113..cf57fe9318d5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -742,6 +742,10 @@ struct memcg_vmstats { long state[MEMCG_NR_STAT]; unsigned long events[NR_MEMCG_EVENTS]; + /* Non-hierarchical (CPU aggregated) page state & events */ + long state_local[MEMCG_NR_STAT]; + unsigned long events_local[NR_MEMCG_EVENTS]; + /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; unsigned long events_pending[NR_MEMCG_EVENTS]; @@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) /* idx can be of type enum memcg_stat_item or node_stat_item. 
*/ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = 0; - int cpu; + long x = READ_ONCE(memcg->vmstats->state_local[idx]); - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event) static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { - long x = 0; - int cpu; int index = memcg_events_index(event); if (index < 0) return 0; - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[index], cpu); - return x; + return READ_ONCE(memcg->vmstats->events_local[index]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -5516,7 +5513,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; - long delta, v; + long delta, delta_cpu, v; int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5532,19 +5529,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ + delta_cpu = 0; v = READ_ONCE(statc->state[i]); if (v != statc->state_prev[i]) { - delta += v - statc->state_prev[i]; + delta_cpu = v - statc->state_prev[i]; + delta += delta_cpu; statc->state_prev[i] = v; } - if (!delta) - continue; - /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats->state[i] += delta; - if (parent) - parent->vmstats->state_pending[i] += delta; + if (delta_cpu) + memcg->vmstats->state_local[i] += delta_cpu; + + if (delta) { + memcg->vmstats->state[i] += delta; + if (parent) + parent->vmstats->state_pending[i] += delta; + } } for (i = 0; i < NR_MEMCG_EVENTS; i++) { @@ -5552,18 +5553,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (delta) memcg->vmstats->events_pending[i] = 0; + delta_cpu = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { - delta += v - statc->events_prev[i]; + delta_cpu = v - statc->events_prev[i]; + delta += delta_cpu; statc->events_prev[i] = v; } - if (!delta) - continue; + if (delta_cpu) + memcg->vmstats->events_local[i] += delta_cpu; - memcg->vmstats->events[i] += delta; - if (parent) - parent->vmstats->events_pending[i] += delta; + if (delta) { + memcg->vmstats->events[i] += delta; + if (parent) + parent->vmstats->events_pending[i] += delta; + } } for_each_node_state(nid, N_MEMORY) { @@ -5581,18 +5586,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (delta) pn->lruvec_stats.state_pending[i] = 0; + delta_cpu = 0; v = READ_ONCE(lstatc->state[i]); if (v != lstatc->state_prev[i]) { - delta += v - lstatc->state_prev[i]; + delta_cpu = v - lstatc->state_prev[i]; + delta += delta_cpu; lstatc->state_prev[i] = v; } - if (!delta) - continue; + if (delta_cpu) + pn->lruvec_stats.state_local[i] += delta_cpu; - pn->lruvec_stats.state[i] += delta; - if (ppn) - ppn->lruvec_stats.state_pending[i] += delta; + if (delta) { + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } } } } diff --git a/mm/workingset.c b/mm/workingset.c index 4686ae363000..da58a26d0d4d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -664,6 +664,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, 
struct lruvec *lruvec; int i; + mem_cgroup_flush_stats(); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, -- cgit v1.2.3 From a379322022c0961fe0b638cdd842d3c38eeff92c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:30 +0100 Subject: mm: convert page_table_check_pte_set() to page_table_check_ptes_set() Tell the page table check how many PTEs & PFNs we want it to check. Link: https://lkml.kernel.org/r/20230802151406.3735276-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 13 +++++++------ mm/page_table_check.c | 16 +++++++++------- 5 files changed, 19 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fe4b913589ee..445b18d7a47c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 44377f0d7c35..01e4aabc8898 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, 1); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada1bbf12961..cd0b6337d03c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 7f6b9bf926c5..6722941c7cb8 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,7 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -64,13 +65,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) 
__page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, ptep, pte); + __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, @@ -123,8 +124,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 46e77c12c81e..af69c3c8f7c2 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,18 +182,20 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr) { + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, ptep_get(ptep)); - if (pte_user_accessible_page(pte)) { - page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, - pte_write(pte)); - } + for (i = 0; i < nr; i++) + __page_table_check_pte_clear(mm, ptep_get(ptep + i)); + if (pte_user_accessible_page(pte)) + page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } -EXPORT_SYMBOL(__page_table_check_pte_set); +EXPORT_SYMBOL(__page_table_check_ptes_set); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { -- cgit v1.2.3 From 29d26f1215de14721188988a59b1426abb85b7be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:33 +0100 Subject: mm: remove ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO Current best practice is to reuse the name of the function as a define to indicate that the function is implemented by the architecture. Link: https://lkml.kernel.org/r/20230802151406.3735276-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/core-api/cachetlb.rst | 24 +++++++++--------------- include/linux/cacheflush.h | 4 ++-- mm/util.c | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index b645947954fb..889fc84ccd1b 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -273,7 +273,7 @@ maps this page at its virtual address. If D-cache aliasing is not an issue, these two routines may simply call memcpy/memset directly and do nothing more. - ``void flush_dcache_page(struct page *page)`` + ``void flush_dcache_folio(struct folio *folio)`` This routines must be called when: @@ -281,7 +281,7 @@ maps this page at its virtual address. and / or in high memory b) the kernel is about to read from a page cache page and user space shared/writable mappings of this page potentially exist. 
Note - that {get,pin}_user_pages{_fast} already call flush_dcache_page + that {get,pin}_user_pages{_fast} already call flush_dcache_folio on any page found in the user address space and thus driver code rarely needs to take this into account. @@ -295,7 +295,7 @@ maps this page at its virtual address. The phrase "kernel writes to a page cache page" means, specifically, that the kernel executes store instructions that dirty data in that - page at the page->virtual mapping of that page. It is important to + page at the kernel virtual mapping of that page. It is important to flush here to handle D-cache aliasing, to make sure these kernel stores are visible to user space mappings of that page. @@ -306,18 +306,18 @@ maps this page at its virtual address. If D-cache aliasing is not an issue, this routine may simply be defined as a nop on that architecture. - There is a bit set aside in page->flags (PG_arch_1) as "architecture + There is a bit set aside in folio->flags (PG_arch_1) as "architecture private". The kernel guarantees that, for pagecache pages, it will clear this bit when such a page first enters the pagecache. This allows these interfaces to be implemented much more efficiently. It allows one to "defer" (perhaps indefinitely) the actual flush if there are currently no user processes mapping this - page. See sparc64's flush_dcache_page and update_mmu_cache_range + page. See sparc64's flush_dcache_folio and update_mmu_cache_range implementations for an example of how to go about doing this. - The idea is, first at flush_dcache_page() time, if - page_file_mapping() returns a mapping, and mapping_mapped on that + The idea is, first at flush_dcache_folio() time, if + folio_flush_mapping() returns a mapping, and mapping_mapped() on that mapping returns %false, just mark the architecture private page flag bit. Later, in update_mmu_cache_range(), a check is made of this flag bit, and if set the flush is done and the flag bit @@ -331,12 +331,6 @@ maps this page at its virtual address. dirty. Again, see sparc64 for examples of how to deal with this. - ``void flush_dcache_folio(struct folio *folio)`` - This function is called under the same circumstances as - flush_dcache_page(). It allows the architecture to - optimise for flushing the entire folio of pages instead - of flushing one page at a time. - ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len)`` ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page, @@ -357,7 +351,7 @@ maps this page at its virtual address. When the kernel needs to access the contents of an anonymous page, it calls this function (currently only - get_user_pages()). Note: flush_dcache_page() deliberately + get_user_pages()). Note: flush_dcache_folio() deliberately doesn't work for an anonymous page. The default implementation is a nop (and should remain so for all coherent architectures). For incoherent architectures, it should flush @@ -374,7 +368,7 @@ maps this page at its virtual address. ``void flush_icache_page(struct vm_area_struct *vma, struct page *page)`` All the functionality of flush_icache_page can be implemented in - flush_dcache_page and update_mmu_cache_range. In the future, the hope + flush_dcache_folio and update_mmu_cache_range. In the future, the hope is to remove this interface completely. 
The final category of APIs is for I/O to deliberately aliased address diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h index a6189d21f2ba..82136f3fcf54 100644 --- a/include/linux/cacheflush.h +++ b/include/linux/cacheflush.h @@ -7,14 +7,14 @@ struct folio; #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +#ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio); #endif #else static inline void flush_dcache_folio(struct folio *folio) { } -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0 +#define flush_dcache_folio flush_dcache_folio #endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* _LINUX_CACHEFLUSH_H */ diff --git a/mm/util.c b/mm/util.c index 5e9305189c3f..cde229b05eb3 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1119,7 +1119,7 @@ void page_offline_end(void) } EXPORT_SYMBOL(page_offline_end); -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +#ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); -- cgit v1.2.3 From 9f1f5b60e76d44fa85fef6970b7477f72d3999eb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:01 +0100 Subject: mm: use flush_icache_pages() in do_set_pmd() Push the iteration over each page down to the architectures (many can flush the entire THP without iteration). Link: https://lkml.kernel.org/r/20230802151406.3735276-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 2c6f45d18b73..fbb7f066bfb6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4265,7 +4265,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; - int i; vm_fault_t ret = VM_FAULT_FALLBACK; if (!transhuge_vma_suitable(vma, haddr)) @@ -4298,8 +4297,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (unlikely(!pmd_none(*vmf->pmd))) goto out; - for (i = 0; i < HPAGE_PMD_NR; i++) - flush_icache_page(vma, page + i); + flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) -- cgit v1.2.3 From de74976eb65151a2f568e477fc2e0032df5b22b4 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:02 +0100 Subject: filemap: add filemap_map_folio_range() filemap_map_folio_range() maps partial/full folio. Comparing to original filemap_map_pages(), it updates refcount once per folio instead of per page and gets minor performance improvement for large folio. With a will-it-scale.page_fault3 like app (change file write fault testing to read fault testing. Trying to upstream it to will-it-scale at [1]), got 2% performance gain on a 48C/96T Cascade Lake test box with 96 processes running against xfs. 
[1]: https://github.com/antonblanchard/will-it-scale/pull/37 Link: https://lkml.kernel.org/r/20230802151406.3735276-35-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 109 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 8040545954bc..bdc1e0b811bf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2168,16 +2168,6 @@ out: } EXPORT_SYMBOL(filemap_get_folios); -static inline -bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) -{ - if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - return false; - if (index >= max) - return false; - return index < folio_next_index(folio) - 1; -} - /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search @@ -3436,10 +3426,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, return false; } -static struct folio *next_uptodate_page(struct folio *folio, - struct address_space *mapping, - struct xa_state *xas, pgoff_t end_pgoff) +static struct folio *next_uptodate_folio(struct xa_state *xas, + struct address_space *mapping, pgoff_t end_pgoff) { + struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { @@ -3477,20 +3467,51 @@ skip: return NULL; } -static inline struct folio *first_map_page(struct address_space *mapping, - struct xa_state *xas, - pgoff_t end_pgoff) +/* + * Map page range [start_page, start_page + nr_pages) of folio. + * start_page is gotten from start by folio_page(folio, start) + */ +static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, + struct folio *folio, unsigned long start, + unsigned long addr, unsigned int nr_pages) { - return next_uptodate_page(xas_find(xas, end_pgoff), - mapping, xas, end_pgoff); -} + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; + struct page *page = folio_page(folio, start); + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + unsigned int ref_count = 0, count = 0; -static inline struct folio *next_map_page(struct address_space *mapping, - struct xa_state *xas, - pgoff_t end_pgoff) -{ - return next_uptodate_page(xas_next_entry(xas, end_pgoff), - mapping, xas, end_pgoff); + do { + if (PageHWPoison(page)) + continue; + + if (mmap_miss > 0) + mmap_miss--; + + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit the + * fault-around logic. 
+ */ + if (!pte_none(*vmf->pte)) + continue; + + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + ref_count++; + do_set_pte(vmf, page, addr); + update_mmu_cache(vma, addr, vmf->pte); + } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); + + /* Restore the vmf->pte */ + vmf->pte -= nr_pages; + + folio_ref_add(folio, ref_count); + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + + return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, @@ -3503,12 +3524,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; - struct page *page; - unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; + int nr_pages = 0; rcu_read_lock(); - folio = first_map_page(mapping, &xas, end_pgoff); + folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; @@ -3525,17 +3545,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } do { -again: - page = folio_file_page(folio, xas.xa_index); - if (PageHWPoison(page)) - goto unlock; - - if (mmap_miss > 0) - mmap_miss--; + unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; + end = folio->index + folio_nr_pages(folio) - 1; + nr_pages = min(end, end_pgoff) - xas.xa_index + 1; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3545,32 +3561,17 @@ again: if (!pte_none(ptep_get(vmf->pte))) goto unlock; - /* We're about to handle the fault */ - if (vmf->address == addr) - ret = VM_FAULT_NOPAGE; + ret |= filemap_map_folio_range(vmf, folio, + xas.xa_index - folio->index, addr, nr_pages); - do_set_pte(vmf, page, addr); - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, addr, vmf->pte); - if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; - } - folio_unlock(folio); - continue; unlock: - if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { - xas.xa_index++; - goto again; - } folio_unlock(folio); folio_put(folio); - } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); + folio = next_uptodate_folio(&xas, mapping, end_pgoff); + } while (folio); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); -- cgit v1.2.3 From 86f35f69db8e7d169c36472a349507ab0a461f49 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:03 +0100 Subject: rmap: add folio_add_file_rmap_range() folio_add_file_rmap_range() allows to add pte mapping to a specific range of file folio. Comparing to page_add_file_rmap(), it batched updates __lruvec_stat for large folio. 
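A stand-alone userspace sketch of the batching idea, with made-up stand-in fields rather than the real struct folio and lruvec counters: the per-page mapcounts are still raised one by one, but the expensive statistics counter is updated once for the whole range instead of once per page.

#include <stdio.h>

#define NR_PAGES 8

struct fake_folio {                     /* stand-in types, not kernel ones */
        int mapcount[NR_PAGES];         /* models page->_mapcount */
        long nr_file_mapped;            /* models the NR_FILE_MAPPED counter */
        int stat_updates;               /* how often that counter was touched */
};

static void add_file_rmap_range(struct fake_folio *f, unsigned int first,
                                unsigned int nr)
{
        int newly_mapped = 0;

        for (unsigned int i = first; i < first + nr; i++) {
                if (f->mapcount[i]++ == 0)      /* first mapping of this page */
                        newly_mapped++;
        }

        /* one statistics update for the whole range, not one per page */
        f->nr_file_mapped += newly_mapped;
        f->stat_updates++;
}

int main(void)
{
        struct fake_folio f = { 0 };

        add_file_rmap_range(&f, 0, NR_PAGES);
        printf("%ld pages mapped with %d stat update(s)\n",
               f.nr_file_mapped, f.stat_updates);
        return 0;
}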
Link: https://lkml.kernel.org/r/20230802151406.3735276-36-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 ++ mm/rmap.c | 60 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b87d01660412..a3825ce81102 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, + struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); diff --git a/mm/rmap.c b/mm/rmap.c index 51ec8aa5e61f..1f04debdc87a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1294,31 +1294,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } /** - * page_add_file_rmap - add pte mapping to a file page - * @page: the page to add the mapping to + * folio_add_file_rmap_range - add pte mapping to page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @nr_pages: The number of pages which will be mapped * @vma: the vm area in which the mapping is added * @compound: charge the page as compound or small page * + * The page range of folio is defined by [first_page, first_page + nr_pages) + * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, - bool compound) +void folio_add_file_rmap_range(struct folio *folio, struct page *page, + unsigned int nr_pages, struct vm_area_struct *vma, + bool compound) { - struct folio *folio = page_folio(page); atomic_t *mapped = &folio->_nr_pages_mapped; - int nr = 0, nr_pmdmapped = 0; - bool first; + unsigned int nr_pmdmapped = 0, first; + int nr = 0; - VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { - first = atomic_inc_and_test(&page->_mapcount); - nr = first; - if (first && folio_test_large(folio)) { - nr = atomic_inc_return_relaxed(mapped); - nr = (nr < COMPOUND_MAPPED); - } + do { + first = atomic_inc_and_test(&page->_mapcount); + if (first && folio_test_large(folio)) { + first = atomic_inc_return_relaxed(mapped); + first = (first < COMPOUND_MAPPED); + } + + if (first) + nr++; + } while (page++, --nr_pages > 0); } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ @@ -1347,6 +1355,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, mlock_vma_folio(folio, vma, compound); } +/** + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page + * + * The caller needs to hold the pte lock. 
+ */ +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) +{ + struct folio *folio = page_folio(page); + unsigned int nr_pages; + + VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); + + if (likely(!compound)) + nr_pages = 1; + else + nr_pages = folio_nr_pages(folio); + + folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); +} + /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from -- cgit v1.2.3 From 3bd786f76de2e01745f462844fd1a206052ee8b8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:04 +0100 Subject: mm: convert do_set_pte() to set_pte_range() set_pte_range() allows to setup page table entries for a specific range. It takes advantage of batched rmap update for large folio. It now takes care of calling update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-37-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 2 +- include/linux/mm.h | 3 ++- mm/filemap.c | 3 +-- mm/memory.c | 37 +++++++++++++++++++++++------------ 4 files changed, 28 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ed148919e11a..211a03053992 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -661,7 +661,7 @@ locked. The VM will unlock the page. Filesystem should find and map pages associated with offsets from "start_pgoff" till "end_pgoff". ->map_pages() is called with the RCU lock held and must not block. If it's not possible to reach a page without blocking, -filesystem should skip it. Filesystem should use do_set_pte() to setup +filesystem should skip it. Filesystem should use set_pte_range() to setup page table entry. Pointer to entry associated with the page is passed in "pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". diff --git a/include/linux/mm.h b/include/linux/mm.h index c1db400e83cb..ddb95967ba64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1322,7 +1322,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); diff --git a/mm/filemap.c b/mm/filemap.c index bdc1e0b811bf..c06e9d331416 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3501,8 +3501,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, ret = VM_FAULT_NOPAGE; ref_count++; - do_set_pte(vmf, page, addr); - update_mmu_cache(vma, addr, vmf->pte); + set_pte_range(vmf, folio, page, 1, addr); } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); /* Restore the vmf->pte */ diff --git a/mm/memory.c b/mm/memory.c index fbb7f066bfb6..12b385eaf353 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4330,15 +4330,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) +/** + * set_pte_range - Set a range of PTEs to point to pages in a folio. + * @vmf: Fault decription. 
+ * @folio: The folio that contains @page. + * @page: The first page to create a PTE for. + * @nr: The number of PTEs to create. + * @addr: The first address to create a PTE for. + */ +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = vmf->address != addr; + bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; - flush_icache_page(vma, page); + flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) @@ -4352,14 +4361,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr); - lru_cache_add_inactive_or_unevictable(page, vma); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr); + VM_BUG_ON_FOLIO(nr != 1, folio); + folio_add_new_anon_rmap(folio, vma, addr); + folio_add_lru_vma(folio, vma); } else { - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, vma, false); + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + folio_add_file_rmap_range(folio, page, nr, vma, false); } - set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) @@ -4427,11 +4440,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { - do_set_pte(vmf, page, vmf->address); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); + struct folio *folio = page_folio(page); + set_pte_range(vmf, folio, page, 1, vmf->address); ret = 0; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); -- cgit v1.2.3 From 617c28ecab22d98a3809370eb6cb50fa24b7bfe1 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:05 +0100 Subject: filemap: batch PTE mappings Call set_pte_range() once per contiguous range of the folio instead of once per page. This batches the updates to mm counters and the rmap. With a will-it-scale.page_fault3 like app (change file write fault testing to read fault testing. Trying to upstream it to will-it-scale at [1]) got 15% performance gain on a 48C/96T Cascade Lake test box with 96 processes running against xfs. Perf data collected before/after the change: 18.73%--page_add_file_rmap | --11.60%--__mod_lruvec_page_state | |--7.40%--__mod_memcg_lruvec_state | | | --5.58%--cgroup_rstat_updated | --2.53%--__mod_lruvec_state | --1.48%--__mod_node_page_state 9.93%--page_add_file_rmap_range | --2.67%--__mod_lruvec_page_state | |--1.95%--__mod_memcg_lruvec_state | | | --1.57%--cgroup_rstat_updated | --0.61%--__mod_lruvec_state | --0.54%--__mod_node_page_state The running time of __mode_lruvec_page_state() is reduced about 9%. 
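The run detection itself can be modelled in plain userspace C (hypothetical helpers, not the kernel functions): walk the pages, extend the current run while pages are usable, and flush each run with a single batched call whenever a page has to be skipped.

#include <stdbool.h>
#include <stdio.h>

/* models one batched set_pte_range()-style call for a run of pages */
static void set_range(unsigned int start, unsigned int count)
{
        if (count)
                printf("batched call for pages [%u, %u)\n", start, start + count);
}

static void map_folio_range(const bool *usable, unsigned int nr_pages)
{
        unsigned int count = 0;

        for (unsigned int i = 0; i < nr_pages; i++) {
                if (usable[i]) {
                        count++;                /* extend the current run */
                        continue;
                }
                set_range(i - count, count);    /* flush the run before the skip */
                count = 0;
        }
        set_range(nr_pages - count, count);     /* flush the trailing run */
}

int main(void)
{
        const bool usable[] = { true, true, false, true, true, true, false, true };

        map_folio_range(usable, sizeof(usable) / sizeof(usable[0]));
        return 0;
}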
[1]: https://github.com/antonblanchard/will-it-scale/pull/37 Link: https://lkml.kernel.org/r/20230802151406.3735276-38-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index c06e9d331416..014b73eb96a1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3480,11 +3480,12 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct file *file = vma->vm_file; struct page *page = folio_page(folio, start); unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); - unsigned int ref_count = 0, count = 0; + unsigned int count = 0; + pte_t *old_ptep = vmf->pte; do { - if (PageHWPoison(page)) - continue; + if (PageHWPoison(page + count)) + goto skip; if (mmap_miss > 0) mmap_miss--; @@ -3494,20 +3495,34 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. */ - if (!pte_none(*vmf->pte)) - continue; - - if (vmf->address == addr) - ret = VM_FAULT_NOPAGE; + if (!pte_none(vmf->pte[count])) + goto skip; - ref_count++; - set_pte_range(vmf, folio, page, 1, addr); - } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); + count++; + continue; +skip: + if (count) { + set_pte_range(vmf, folio, page, count, addr); + folio_ref_add(folio, count); + if (in_range(vmf->address, addr, count)) + ret = VM_FAULT_NOPAGE; + } - /* Restore the vmf->pte */ - vmf->pte -= nr_pages; + count++; + page += count; + vmf->pte += count; + addr += count * PAGE_SIZE; + count = 0; + } while (--nr_pages > 0); + + if (count) { + set_pte_range(vmf, folio, page, count, addr); + folio_ref_add(folio, count); + if (in_range(vmf->address, addr, count)) + ret = VM_FAULT_NOPAGE; + } - folio_ref_add(folio, ref_count); + vmf->pte = old_ptep; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; -- cgit v1.2.3 From 5003a2bdf6880dc9c301f555bece1154081158fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:06 +0100 Subject: mm: call update_mmu_cache_range() in more page fault handling paths Pass the vm_fault to the architecture to help it make smarter decisions about which PTEs to insert into the TLB. 
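Purely as an illustration of the kind of policy this enables, the userspace sketch below (simplified signature, made-up helper) preloads only the entry that matches the faulting address out of the whole range; real architectures may choose differently.

#include <stdio.h>

#define PAGE_SIZE 4096UL

static void preload_tlb(unsigned long addr)
{
        printf("preload TLB entry for %#lx\n", addr);
}

/* models an arch hook that now sees the faulting address and the range */
static void update_mmu_cache_range(unsigned long fault_addr,
                                   unsigned long start, unsigned int nr)
{
        for (unsigned int i = 0; i < nr; i++) {
                unsigned long addr = start + i * PAGE_SIZE;

                /* knowing which address faulted lets the arch be selective */
                if (addr == fault_addr)
                        preload_tlb(addr);
        }
}

int main(void)
{
        update_mmu_cache_range(0x201000UL, 0x200000UL, 4);
        return 0;
}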
Link: https://lkml.kernel.org/r/20230802151406.3735276-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 12b385eaf353..9d7fb721a680 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2862,7 +2862,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) - update_mmu_cache(vma, addr, vmf->pte); + update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* @@ -3039,7 +3039,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } @@ -3163,7 +3163,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may @@ -4046,7 +4046,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4170,7 +4170,7 @@ setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4859,7 +4859,7 @@ out_map: if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -5030,7 +5030,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vmf->vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vmf->vma, vmf->address, + vmf->pte, 1); } else { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) -- cgit v1.2.3 From cfeed8ffe55b37fa10286aaaa1369da00cb88440 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:46 +0200 Subject: mm/swap: stop using page->private on tail pages for THP_SWAP Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups". This series stops using page->private on tail pages for THP_SWAP, replaces folio->private by folio->swap for swapcache folios, and starts using "new_folio" for tail pages that we are splitting to remove the usage of page->private for swapcache handling completely. This patch (of 4): Let's stop using page->private on tail pages, making it possible to just unconditionally reuse that field in the tail pages of large folios. 
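The effect can be sketched in userspace with stand-in types (not the real struct folio): only the folio-level swap entry is stored, and a tail page's value is derived from it plus the page's index inside the folio, so the tail pages' private fields are never needed.

#include <stdio.h>

struct fake_folio {                     /* stand-in for struct folio */
        unsigned long swap_val;         /* models folio->swap.val */
        unsigned int nr_pages;
};

/* models page_swap_entry(): folio-level value plus the page's index */
static unsigned long page_swap_val(const struct fake_folio *folio,
                                   unsigned int page_idx)
{
        return folio->swap_val + page_idx;
}

int main(void)
{
        struct fake_folio folio = { .swap_val = 1000, .nr_pages = 512 };

        /* nothing is stored in the tail pages; entries are derived on demand */
        for (unsigned int i = 0; i < 4; i++)
                printf("page %u -> swap entry value %lu\n",
                       i, page_swap_val(&folio, i));
        return 0;
}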
The remaining usage of the private field for THP_SWAP is in the THP splitting code (mm/huge_memory.c), that we'll handle separately later. Update the THP_SWAP documentation and sanity checks in mm_types.h and __split_huge_page_tail(). [david@redhat.com: stop using page->private on tail pages for THP_SWAP] Link: https://lkml.kernel.org/r/6f0a82a3-6948-20d9-580b-be1dbf415701@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-1-david@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas [arm64] Reviewed-by: Yosry Ahmed Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mteswap.c | 5 +++-- include/linux/mm_types.h | 12 +----------- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 15 ++++++--------- mm/memory.c | 2 +- mm/rmap.c | 2 +- mm/swap_state.c | 5 +++-- mm/swapfile.c | 4 ++-- 8 files changed, 26 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index cd508ba80ab1..a31833e3ddc5 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -33,8 +33,9 @@ int mte_save_tags(struct page *page) mte_save_page_tags(page_address(page), tag_storage); - /* page_private contains the swap entry.val set in do_swap_page */ - ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL); + /* lookup the swap entry.val from the page */ + ret = xa_store(&mte_pages, page_swap_entry(page).val, tag_storage, + GFP_KERNEL); if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { mte_free_tag_storage(tag_storage); return xa_err(ret); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b9d8be28361..55cd4bc57b8d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,11 +322,8 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; - /* 4 byte gap here */ - /* private: the union with struct page is transitional */ - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_1; #endif + /* private: the union with struct page is transitional */ }; struct page __page_1; }; @@ -347,9 +344,6 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ - unsigned long _avail_2a; - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_2a; }; struct page __page_2; }; @@ -374,9 +368,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -#ifdef CONFIG_64BIT -FOLIO_MATCH(private, _private_1); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ @@ -385,7 +376,6 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); -FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** diff --git a/include/linux/swap.h b/include/linux/swap.h index bb5adc604144..e5cf58a1cf9e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -339,6 +339,15 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline swp_entry_t page_swap_entry(struct page *page) +{ + struct folio *folio = page_folio(page); + swp_entry_t entry = folio_swap_entry(folio); + + entry.val += folio_page_idx(folio, page); + return 
entry; +} + static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { folio->private = (void *)entry.val; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb4432792b88..a28e9fe16585 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2446,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; /* - * page->private should not be set in tail pages with the exception - * of swap cache pages that store the swp_entry_t in tail pages. - * Fix up and warn once if private is unexpectedly set. - * - * What of 32-bit systems, on which folio->_pincount overlays - * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and - * pincount must be 0 for folio_ref_freeze() to have succeeded. + * page->private should not be set in tail pages. Fix up and warn once + * if private is unexpectedly set. */ - if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + if (unlikely(page_tail->private)) { + VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } + if (PageSwapCache(head)) + set_page_private(page_tail, (unsigned long)head->private + tail); /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); diff --git a/mm/memory.c b/mm/memory.c index 9d7fb721a680..d104a38e8545 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3879,7 +3879,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * changed. */ if (unlikely(!folio_test_swapcache(folio) || - page_private(page) != entry.val)) + page_swap_entry(page).val != entry.val)) goto out_page; /* diff --git a/mm/rmap.c b/mm/rmap.c index 1f04debdc87a..ec7f8e6c9e48 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1647,7 +1647,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(&folio->page)); } else if (folio_test_anon(folio)) { - swp_entry_t entry = { .val = page_private(subpage) }; + swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. 
diff --git a/mm/swap_state.c b/mm/swap_state.c index 01f15139b7d9..2f2417810052 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -100,6 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, folio_ref_add(folio, nr); folio_set_swapcache(folio); + folio_set_swap_entry(folio, entry); do { xas_lock_irq(&xas); @@ -113,7 +114,6 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, if (shadowp) *shadowp = old; } - set_page_private(folio_page(folio, i), entry.val + i); xas_store(&xas, folio); xas_next(&xas); } @@ -154,9 +154,10 @@ void __delete_from_swap_cache(struct folio *folio, for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); - set_page_private(folio_page(folio, i), 0); xas_next(&xas); } + entry.val = 0; + folio_set_swap_entry(folio, entry); folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); diff --git a/mm/swapfile.c b/mm/swapfile.c index d46933adf789..bd9d904671b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3369,7 +3369,7 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = page_swap_entry(page); return swp_swap_info(entry); } @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; + swp_entry_t swap = page_swap_entry(page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); -- cgit v1.2.3 From 3d2c908768877714a354ee6d7bf93e801400d5e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:48 +0200 Subject: mm/swap: inline folio_set_swap_entry() and folio_swap_entry() Let's simply work on the folio directly and remove the helpers. 
Link: https://lkml.kernel.org/r/20230821160849.531668-4-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Matthew Wilcox Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/swap.h | 12 +----------- mm/memory.c | 2 +- mm/shmem.c | 6 +++--- mm/swap_state.c | 7 +++---- mm/swapfile.c | 2 +- mm/util.c | 2 +- mm/vmscan.c | 2 +- mm/zswap.c | 4 ++-- 8 files changed, 13 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 352eca0a75bc..493487ed7c38 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -333,25 +333,15 @@ struct swap_info_struct { */ }; -static inline swp_entry_t folio_swap_entry(struct folio *folio) -{ - return folio->swap; -} - static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } -static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) -{ - folio->swap = entry; -} - /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); diff --git a/mm/memory.c b/mm/memory.c index d104a38e8545..421fcef3a3e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3828,7 +3828,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru(folio); /* To provide entry to swap_readpage() */ - folio_set_swap_entry(folio, entry); + folio->swap = entry; swap_readpage(page, true, NULL); folio->private = NULL; } diff --git a/mm/shmem.c b/mm/shmem.c index 99fb60ec2c3d..980289be5f63 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1642,7 +1642,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, int error; old = *foliop; - entry = folio_swap_entry(old); + entry = old->swap; swap_index = swp_offset(entry); swap_mapping = swap_address_space(entry); @@ -1663,7 +1663,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); - folio_set_swap_entry(new, entry); + new->swap = entry; folio_set_swapcache(new); /* @@ -1785,7 +1785,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* We have to do this with folio locked to prevent races */ folio_lock(folio); if (!folio_test_swapcache(folio) || - folio_swap_entry(folio).val != swap.val || + folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; goto unlock; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2f2417810052..b3b14bd0dd64 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -100,7 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, folio_ref_add(folio, nr); folio_set_swapcache(folio); - folio_set_swap_entry(folio, entry); + folio->swap = entry; do { xas_lock_irq(&xas); @@ -156,8 +156,7 @@ void __delete_from_swap_cache(struct folio *folio, VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } - entry.val = 0; - folio_set_swap_entry(folio, entry); + folio->swap.val = 0; folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); @@ -233,7 +232,7 @@ fail: */ void delete_from_swap_cache(struct folio *folio) { - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; 
struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); diff --git a/mm/swapfile.c b/mm/swapfile.c index bd9d904671b9..e52f486834eb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1536,7 +1536,7 @@ unlock_out: static bool folio_swapped(struct folio *folio) { - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) diff --git a/mm/util.c b/mm/util.c index cde229b05eb3..f31e2ca62cfa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -764,7 +764,7 @@ struct address_space *folio_mapping(struct folio *folio) return NULL; if (unlikely(folio_test_swapcache(folio))) - return swap_address_space(folio_swap_entry(folio)); + return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) diff --git a/mm/vmscan.c b/mm/vmscan.c index c7c149cb8d66..6f13394b112e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1423,7 +1423,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, } if (folio_test_swapcache(folio)) { - swp_entry_t swap = folio_swap_entry(folio); + swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); diff --git a/mm/zswap.c b/mm/zswap.c index 7300b98d4a03..412b1409a0d7 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1190,7 +1190,7 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { - swp_entry_t swp = folio_swap_entry(folio); + swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; @@ -1370,7 +1370,7 @@ shrink: bool zswap_load(struct folio *folio) { - swp_entry_t swp = folio_swap_entry(folio); + swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; -- cgit v1.2.3 From 07e09c483cbef2a252f75d95670755a0607288f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:49 +0200 Subject: mm/huge_memory: work on folio->swap instead of page->private when splitting folio Let's work on folio->swap instead. While at it, use folio_test_anon() and folio_test_swapcache() -- the original folio remains valid even after splitting (but is then an order-0 folio). We can probably convert a lot more to folios in that code, let's focus on folio->swap handling only for now. Link: https://lkml.kernel.org/r/20230821160849.531668-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28e9fe16585..c4635f750255 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2401,10 +2401,16 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } } -static void __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct folio *folio, int tail, struct lruvec *lruvec, struct list_head *list) { + struct page *head = &folio->page; struct page *page_tail = head + tail; + /* + * Careful: new_folio is not a "real" folio before we cleared PageTail. + * Don't pass it around before clear_compound_head(). 
+ */ + struct folio *new_folio = (struct folio *)page_tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); @@ -2453,8 +2459,8 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } - if (PageSwapCache(head)) - set_page_private(page_tail, (unsigned long)head->private + tail); + if (folio_test_swapcache(folio)) + new_folio->swap.val = folio->swap.val + tail; /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2500,11 +2506,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ split_page_memcg(head, nr); - if (PageAnon(head) && PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - offset = swp_offset(entry); - swap_cache = swap_address_space(entry); + if (folio_test_anon(folio) && folio_test_swapcache(folio)) { + offset = swp_offset(folio->swap); + swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } @@ -2514,7 +2518,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(head, i, lruvec, list); + __split_huge_page_tail(folio, i, lruvec, list); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); @@ -2559,11 +2563,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, remap_page(folio, nr); - if (PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - split_swap_cluster(entry); - } + if (folio_test_swapcache(folio)) + split_swap_cluster(folio->swap); for (i = 0; i < nr; i++) { struct page *subpage = head + i; -- cgit v1.2.3 From 14a405c3a933ce261147a60dfb8bf586b45ec9de Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Sat, 19 Aug 2023 08:13:02 +0000 Subject: memcg: remove duplication detection for mem_cgroup_uncharge_swap __mem_cgroup_uncharge_swap is only called in mem_cgroup_uncharge_swap, if mem cgroup is disabled, __mem_cgroup_uncharge_swap cannot be called. Therefore, there is no need to judge whether mem_cgroup is disabled or not. Link: https://lkml.kernel.org/r/20230819081302.1217098-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf57fe9318d5..0286bd0ab043 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7535,9 +7535,6 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); -- cgit v1.2.3 From bb7dbaafff3f582d18028a5b99a8faa789842678 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 19 Aug 2023 04:18:37 +0100 Subject: mm: remove checks for pte_index Since pte_index is always defined, we don't need to check whether it's defined or not. Delete the slow version that doesn't depend on it and remove the #define since nobody needs to test for it. 
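For reference, this is all pte_index() computes; the tiny userspace example below uses the same formula as the generic helper, with illustrative x86-64-style constants.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PTRS_PER_PTE    512UL

/* same formula as the generic helper: address bits select the PTE slot */
static unsigned long pte_index(unsigned long address)
{
        return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

int main(void)
{
        printf("pte_index(0x0000)   = %lu\n", pte_index(0x0000UL));
        printf("pte_index(0x1000)   = %lu\n", pte_index(0x1000UL));
        printf("pte_index(0x200000) = %lu\n", pte_index(0x200000UL)); /* wraps to 0 */
        return 0;
}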
Link: https://lkml.kernel.org/r/20230819031837.3160096-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Cc: Christian Dietrich Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 1 - mm/memory.c | 17 +---------------- 2 files changed, 1 insertion(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc811c9b421a..95ad544ad395 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -63,7 +63,6 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } -#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) diff --git a/mm/memory.c b/mm/memory.c index 421fcef3a3e7..50f44c1bfa19 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1869,7 +1869,6 @@ out: return retval; } -#ifdef pte_index static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { @@ -1884,7 +1883,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, } /* insert_pages() amortizes the cost of spinlock operations - * when inserting pages in a loop. Arch *must* define pte_index. + * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) @@ -1943,7 +1942,6 @@ out: *num = remaining_pages_total; return ret; } -#endif /* ifdef pte_index */ /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. @@ -1963,7 +1961,6 @@ out: int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { -#ifdef pte_index const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) @@ -1975,18 +1972,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); -#else - unsigned long idx = 0, pgcount = *num; - int err = -EINVAL; - - for (; idx < pgcount; ++idx) { - err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); - if (err) - break; - } - *num = pgcount - idx; - return err; -#endif /* ifdef pte_index */ } EXPORT_SYMBOL(vm_insert_pages); -- cgit v1.2.3 From 40d49a3c9e4a0e5cf7a6fcebc8d4d7d63d1f3f1b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:34 +0100 Subject: mm: allow ->huge_fault() to be called without the mmap_lock held Remove the checks for the VMA lock being held, allowing the page fault path to call into the filesystem instead of retrying with the mmap_lock held. This will improve scalability for DAX page faults. Also update the documentation to match (and fix some other changes that have happened recently). 
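For out-of-tree code that does still depend on the mmap_lock, the escape hatch described in the porting.rst hunk below could look roughly like this hypothetical sketch. It is not a complete or buildable handler; it merely mirrors the FAULT_FLAG_VMA_LOCK check this patch removes from mm/memory.c, and it assumes the order-based prototype adopted by the following patch.

static vm_fault_t example_huge_fault(struct vm_fault *vmf, unsigned int order)
{
        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                /* this path has not been audited for per-VMA lock safety */
                vma_end_read(vmf->vma);
                return VM_FAULT_RETRY;
        }

        /* ... normal PMD/PUD-sized fault handling would go here ... */
        return VM_FAULT_FALLBACK;
}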
Link: https://lkml.kernel.org/r/20230818202335.2739663-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 36 ++++++++++++++++++++++------------- Documentation/filesystems/porting.rst | 11 +++++++++++ mm/memory.c | 22 ++------------------- 3 files changed, 36 insertions(+), 33 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 211a03053992..1a2cb60b2499 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -628,26 +628,29 @@ vm_operations_struct prototypes:: - void (*open)(struct vm_area_struct*); - void (*close)(struct vm_area_struct*); - vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *); + void (*open)(struct vm_area_struct *); + void (*close)(struct vm_area_struct *); + vm_fault_t (*fault)(struct vm_fault *); + vm_fault_t (*huge_fault)(struct vm_fault *, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *, pgoff_t start, pgoff_t end); vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: -============= ========= =========================== +============= ========== =========================== ops mmap_lock PageLocked(page) -============= ========= =========================== -open: yes -close: yes -fault: yes can return with page locked -map_pages: read -page_mkwrite: yes can return with page locked -pfn_mkwrite: yes -access: yes -============= ========= =========================== +============= ========== =========================== +open: write +close: read/write +fault: read can return with page locked +huge_fault: maybe-read +map_pages: maybe-read +page_mkwrite: read can return with page locked +pfn_mkwrite: read +access: read +============= ========== =========================== ->fault() is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated with the passed in @@ -657,6 +660,13 @@ then ensure the page is not already truncated (invalidate_lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. +->huge_fault() is called when there is no PUD or PMD entry present. This +gives the filesystem the opportunity to install a PUD or PMD sized page. +Filesystems can also use the ->fault method to return a PMD sized page, +so implementing this function may not be necessary. In particular, +filesystems should not call filemap_fault() from ->huge_fault(). +The mmap_lock may not be held when this method is called. + ->map_pages() is called when VM asks to map easy accessible pages. Filesystem should find and map pages associated with offsets from "start_pgoff" till "end_pgoff". ->map_pages() is called with the RCU lock held and must diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index d2d684ae7798..7ce352265de1 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -943,3 +943,14 @@ file pointer instead of struct dentry pointer. d_tmpfile() is similarly changed to simplify callers. The passed file is in a non-open state and on success must be opened before returning (e.g. by calling finish_open_simple()). 
+ +--- + +**mandatory** + +Calling convention for ->huge_fault has changed. It now takes a page +order instead of an enum page_entry_size, and it may be called without the +mmap_lock held. All in-tree users have been audited and do not seem to +depend on the mmap_lock being held, but out of tree users should verify +for themselves. If they do need it, they can return VM_FAULT_RETRY to +be called with the mmap_lock held. diff --git a/mm/memory.c b/mm/memory.c index 50f44c1bfa19..7a7e58729510 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4854,13 +4854,8 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); - } return VM_FAULT_FALLBACK; } @@ -4880,10 +4875,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4904,13 +4895,8 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); - } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4927,10 +4913,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); if (!(ret & VM_FAULT_FALLBACK)) return ret; -- cgit v1.2.3 From 1d024e7a8dabcc3c84d77532a88c774c32cf8245 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:35 +0100 Subject: mm: remove enum page_entry_size Remove the unnecessary encoding of page order into an enum and pass the page order directly. That lets us get rid of pe_order(). The switch constructs have to be changed to if/else constructs to prevent GCC from warning on builds with 3-level page tables where PMD_ORDER and PUD_ORDER have the same value. If you are looking at this commit because your driver stopped compiling, look at the previous commit as well and audit your driver to be sure it doesn't depend on mmap_lock being held in its ->huge_fault method. 
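The switch-versus-if/else point can be reproduced in a stand-alone userspace example; the constants are stand-ins chosen only so that the two orders are equal, as they are on 3-level page tables.

#include <stdio.h>

#define PMD_ORDER 9
#define PUD_ORDER 9     /* stand-in values; what matters is that they are equal */

static const char *fault_size(unsigned int order)
{
#if 0   /* a switch trips the compiler here: duplicate case value */
        switch (order) {
        case 0:                 return "pte";
        case PMD_ORDER:         return "pmd";
        case PUD_ORDER:         return "pud";
        }
#endif
        if (order == 0)
                return "pte";
        else if (order == PMD_ORDER)
                return "pmd";
        else if (order == PUD_ORDER)
                return "pud";
        return "unknown";
}

int main(void)
{
        printf("order 0 -> %s, order 9 -> %s\n", fault_size(0), fault_size(9));
        return 0;
}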
[willy@infradead.org: use "order %u" to match the (non dev_t) style] Link: https://lkml.kernel.org/r/ZOUYekbtTv+n8hYf@casper.infradead.org Link: https://lkml.kernel.org/r/20230818202335.2739663-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/dax/device.c | 22 ++++++++-------------- fs/dax.c | 30 ++++++++---------------------- fs/erofs/data.c | 6 +++--- fs/ext2/file.c | 2 +- fs/ext4/file.c | 11 +++++------ fs/fuse/dax.c | 20 +++++++++----------- fs/xfs/xfs_file.c | 24 ++++++++++++------------ fs/xfs/xfs_trace.h | 20 ++++++-------------- include/linux/dax.h | 4 ++-- include/linux/mm.h | 10 +--------- mm/memory.c | 8 ++++---- 11 files changed, 59 insertions(+), 98 deletions(-) (limited to 'mm') diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 30665a3ff6ea..93ebedc5ec8c 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -228,32 +228,26 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { struct file *filp = vmf->vma->vm_file; vm_fault_t rc = VM_FAULT_SIGBUS; int id; struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, + dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) order:%d\n", current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? "write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end, pe_size); + vmf->vma->vm_start, vmf->vma->vm_end, order); id = dax_read_lock(); - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) rc = __dev_dax_pte_fault(dev_dax, vmf); - break; - case PE_SIZE_PMD: + else if (order == PMD_ORDER) rc = __dev_dax_pmd_fault(dev_dax, vmf); - break; - case PE_SIZE_PUD: + else if (order == PUD_ORDER) rc = __dev_dax_pud_fault(dev_dax, vmf); - break; - default: + else rc = VM_FAULT_SIGBUS; - } dax_read_unlock(id); @@ -262,7 +256,7 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, static vm_fault_t dev_dax_fault(struct vm_fault *vmf) { - return dev_dax_huge_fault(vmf, PE_SIZE_PTE); + return dev_dax_huge_fault(vmf, 0); } static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) diff --git a/fs/dax.c b/fs/dax.c index 88bb13643117..8fafecbe42b1 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -30,17 +30,6 @@ #define CREATE_TRACE_POINTS #include -static inline unsigned int pe_order(enum page_entry_size pe_size) -{ - if (pe_size == PE_SIZE_PTE) - return PAGE_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PMD) - return PMD_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PUD) - return PUD_SHIFT - PAGE_SHIFT; - return ~0; -} - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -1905,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @pe_size: Size of the page to fault in + * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system @@ -1915,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * 
successfully. */ -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); - case PE_SIZE_PMD: + else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); - default: + else return VM_FAULT_FALLBACK; - } } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1976,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted + * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. */ -vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, + pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - unsigned int order = pe_order(pe_size); size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index db5e4b7636ec..0c2c99c58b5e 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0b4c91c62e1f..1039e5bf90af 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) } filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0f..2dc3f8301225 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -723,8 +723,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; @@ -740,7 +739,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. 
*/ @@ -764,7 +763,7 @@ retry: } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +772,7 @@ retry: goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +784,7 @@ retry: static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8e74f278a3f6..23904a6a9a96 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping, return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); } -static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, bool write) +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, + bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -809,7 +809,7 @@ retry: * to populate page cache or access memory we are trying to free. */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; @@ -818,7 +818,7 @@ retry: } if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -829,24 +829,22 @@ retry: static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, - vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } -static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { - return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4f502219ae4f..203700278ddb 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1287,11 +1287,11 @@ xfs_file_llseek( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { - return dax_iomap_fault(vmf, pe_size, pfn, NULL, + return dax_iomap_fault(vmf, order, pfn, NULL, (write_fault && !vmf->cow_page) ? 
&xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); @@ -1300,7 +1300,7 @@ xfs_dax_fault( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { @@ -1322,14 +1322,14 @@ xfs_dax_fault( static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; - trace_xfs_filemap_fault(ip, pe_size, write_fault); + trace_xfs_filemap_fault(ip, order, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); @@ -1340,9 +1340,9 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); + ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { if (write_fault) { @@ -1373,7 +1373,7 @@ xfs_filemap_fault( struct vm_fault *vmf) { /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + return __xfs_filemap_fault(vmf, 0, IS_DAX(file_inode(vmf->vma->vm_file)) && xfs_is_write_fault(vmf)); } @@ -1381,13 +1381,13 @@ xfs_filemap_fault( static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, pe_size, + return __xfs_filemap_fault(vmf, order, xfs_is_write_fault(vmf)); } @@ -1395,7 +1395,7 @@ static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } /* @@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f3cc204bb4bf..fd789e00dfd6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -802,36 +802,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. 
*/ -TRACE_DEFINE_ENUM(PE_SIZE_PTE); -TRACE_DEFINE_ENUM(PE_SIZE_PMD); -TRACE_DEFINE_ENUM(PE_SIZE_PUD); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, - bool write_fault), - TP_ARGS(ip, pe_size, write_fault), + TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), + TP_ARGS(ip, order, write_fault), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(enum page_entry_size, pe_size) + __field(unsigned int, order) __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->pe_size = pe_size; + __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->pe_size, - { PE_SIZE_PTE, "PTE" }, - { PE_SIZE_PMD, "PMD" }, - { PE_SIZE_PUD, "PUD" }), + __entry->order, __entry->write_fault) ) diff --git a/include/linux/dax.h b/include/linux/dax.h index 261944ec0887..22cd9902345d 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -241,10 +241,10 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn); + unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/mm.h b/include/linux/mm.h index ddb95967ba64..53efddc4d178 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,13 +532,6 @@ struct vm_fault { */ }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -562,8 +555,7 @@ struct vm_operations_struct { int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, - enum page_entry_size pe_size); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); diff --git a/mm/memory.c b/mm/memory.c index 7a7e58729510..00a5ce113090 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4855,7 +4855,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } @@ -4875,7 +4875,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + ret = 
vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } @@ -4896,7 +4896,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4913,7 +4913,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } -- cgit v1.2.3 From 19134bc23500a01bfdb77a804fc8e4bf8808d0cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:27 +0100 Subject: mm: fix kernel-doc warning from tlb_flush_rmaps() Patch series "Improve mm documentation". If you build with W=1, kernel-doc complains about tlb_flush_rmaps(). Then I ran scripts/find-unused-docs.sh against mm/ and found a large number of files which weren't included in the ReST documentation. I fixed up a couple of them, and added all those without erros to the rst files. There's a lot more work to do to organise all of this, but at least now if we have documentation that refers to these functions, we'll get a nice link to them. This patch (of 4): The vma parameter wasn't described. Link: https://lkml.kernel.org/r/20230818200630.2719595-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230818200630.2719595-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index ea9683e12936..4f559f4ddd21 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -63,6 +63,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_ /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather + * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so -- cgit v1.2.3 From 853f62a30422f1a9a9c1f4b44df9b0de0d46a9e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:28 +0100 Subject: mm: fix get_mctgt_type() kernel-doc Convert the return values to an ReST list and tidy up the wording while I'm touching it. [akpm@linux-foundation.org: changes suggested by Randy] [willy@infradead.org: another change suggested by Randy] Link: https://lkml.kernel.org/r/ZOUZtZizeQG7PcsM@casper.infradead.org Link: https://lkml.kernel.org/r/20230818200630.2719595-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0286bd0ab043..b29b850cf399 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5850,25 +5850,20 @@ out: * @ptent: the pte to be checked * @target: the pointer the target page or swap ent will be stored(can be NULL) * - * Returns - * 0(MC_TARGET_NONE): if the pte is not a target for move charge. 
- * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for - * move charge. if @target is not NULL, the page is stored in target->page - * with extra refcnt got(Callers should handle it). - * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a - * target for charge migration. if @target is not NULL, the entry is stored - * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and - * thus not on the lru. - * For now we such page is charge like a regular page would be as for all - * intent and purposes it is just special memory taking the place of a - * regular page. - * - * See Documentations/vm/hmm.txt and include/linux/hmm.h - * - * Called with pte lock held. + * Context: Called with pte lock held. + * Return: + * * MC_TARGET_NONE - If the pte is not a target for move charge. + * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for + * move charge. If @target is not NULL, the page is stored in target->page + * with extra refcnt taken (Caller should release it). + * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a + * target for charge migration. If @target is not NULL, the entry is + * stored in target->ent. + * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. For now such page is charged like a regular page + * would be as it is just special memory taking the place of a regular page. + * See Documentations/vm/hmm.txt and include/linux/hmm.h */ - static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { -- cgit v1.2.3 From 01a7eb3e20994701700631ec30462087c4ecf142 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:29 +0100 Subject: mm: fix clean_record_shared_mapping_range kernel-doc Turn the a), b) into an unordered ReST list and remove the unnecessary 'Note:' prefix. Link: https://lkml.kernel.org/r/20230818200630.2719595-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mapping_dirty_helpers.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index a26dd8bcfcdb..2f8829b3541a 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -288,13 +288,14 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range); * @end: Pointer to the number of the last set bit in @bitmap. * none set. The value is modified as new bits are set by the function. * - * Note: When this function returns there is no guarantee that a CPU has + * When this function returns there is no guarantee that a CPU has * not already dirtied new ptes. However it will not clean any ptes not * reported in the bitmap. The guarantees are as follows: - * a) All ptes dirty when the function starts executing will end up recorded - * in the bitmap. - * b) All ptes dirtied after that will either remain dirty, be recorded in the - * bitmap or both. + * + * * All ptes dirty when the function starts executing will end up recorded + * in the bitmap. + * * All ptes dirtied after that will either remain dirty, be recorded in the + * bitmap or both. 
* * If a caller needs to make sure all dirty ptes are picked up and none * additional are added, it first needs to write-protect the address-space -- cgit v1.2.3 From 8cfd014efd93e9450fcd4892bbfe8b10f41e53c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 18:24:59 +0100 Subject: hugetlb: add documentation for vma_kernel_pagesize() This is an exported symbol, so it should have kernel-doc. Update it to mention folios, and point out that they might be larger than the supported page size for this VMA. Link: https://lkml.kernel.org/r/20230822172459.4190699-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbc25826c9b0..ba6d39b71cb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -968,9 +968,14 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(linear_hugepage_index); -/* - * Return the size of the pages allocated when backing a VMA. In the majority - * cases this will be same size as used by the page table entries. +/** + * vma_kernel_pagesize - Page size granularity for this VMA. + * @vma: The user mapping. + * + * Folios in this VMA will be aligned to, and at least the size of the + * number of bytes returned by this function. + * + * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { -- cgit v1.2.3 From 8f9ff2deb8b91ad1cba666c7adbe4ca79e2d3225 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 21:23:35 +0100 Subject: secretmem: convert page_is_secretmem() to folio_is_secretmem() The only caller already has a folio, so use it to save calling compound_head() in PageLRU() and remove a use of page->mapping. Link: https://lkml.kernel.org/r/20230822202335.179081-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 15 +++++++-------- mm/gup.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 988528b5da43..35f3a4a8ceb1 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -6,24 +6,23 @@ extern const struct address_space_operations secretmem_aops; -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { struct address_space *mapping; /* - * Using page_mapping() is quite slow because of the actual call - * instruction and repeated compound_head(page) inside the - * page_mapping() function. + * Using folio_mapping() is quite slow because of the actual call + * instruction. * We know that secretmem pages are not compound and LRU so we can * save a couple of cycles here. 
*/ - if (PageCompound(page) || !PageLRU(page)) + if (folio_test_large(folio) || !folio_test_lru(folio)) return false; mapping = (struct address_space *) - ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); + ((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS); - if (!mapping || mapping != page->mapping) + if (!mapping || mapping != folio->mapping) return false; return mapping->a_ops == &secretmem_aops; @@ -39,7 +38,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma) return false; } -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { return false; } diff --git a/mm/gup.c b/mm/gup.c index ee4fc15ce88e..948f3b454b00 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2600,7 +2600,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(page_is_secretmem(page))) { + if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } -- cgit v1.2.3
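The kernel-doc fixes in this series (tlb_flush_rmaps(), get_mctgt_type(), clean_record_shared_mapping_range(), vma_kernel_pagesize()) move the mm documentation toward the same comment layout: every parameter described, an optional Context: line, and return values written as a ReST list under Return:. A generic sketch of that layout is shown below; the function, parameters and return values are invented purely for illustration and do not correspond to any real API.

	/**
	 * example_walk_range - one-line summary of what the function does.
	 * @vma: The VMA being walked (every parameter gets a description).
	 * @addr: Start address; must lie within @vma.
	 *
	 * Free-form description of the behaviour, caveats and locking
	 * interactions goes here, before the structured trailers.
	 *
	 * Context: Called with the pte lock held.
	 * Return:
	 * * 0       - the range was walked successfully.
	 * * -EINVAL - @addr does not lie within @vma.
	 */
	int example_walk_range(struct vm_area_struct *vma, unsigned long addr);

A W=1 build reports kernel-doc problems against this layout, which is what flagged the missing @vma description fixed in tlb_flush_rmaps() above.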