From bbefa0fc04bab21e85f6b2ee7984c59694366f6a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:36 +0800 Subject: mm/compaction: use correct list in move_freelist_{head}/{tail} Patch series "Fixes and cleanups to compaction", v3. This is a series to do fix and clean up to compaction. Patch 1-2 fix and clean up freepage list operation. Patch 3-4 fix and clean up isolation of freepages Patch 7 factor code to check if compaction is needed for allocation order. More details can be found in respective patches. This patch (of 6): The freepage is chained with buddy_list in freelist head. Use buddy_list instead of lru to correct the list operation. Link: https://lkml.kernel.org/r/20230901155141.249860-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230901155141.249860-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 38c8d216c6a3..e3ee1bc1c0ad 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1395,8 +1395,8 @@ move_freelist_head(struct list_head *freelist, struct page *freepage) { LIST_HEAD(sublist); - if (!list_is_last(freelist, &freepage->lru)) { - list_cut_before(&sublist, freelist, &freepage->lru); + if (!list_is_last(freelist, &freepage->buddy_list)) { + list_cut_before(&sublist, freelist, &freepage->buddy_list); list_splice_tail(&sublist, freelist); } } @@ -1412,8 +1412,8 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) { LIST_HEAD(sublist); - if (!list_is_first(freelist, &freepage->lru)) { - list_cut_position(&sublist, freelist, &freepage->lru); + if (!list_is_first(freelist, &freepage->buddy_list)) { + list_cut_position(&sublist, freelist, &freepage->buddy_list); list_splice_tail(&sublist, freelist); } } -- cgit v1.2.3 From 4c17989116cb0a6a91f4184077c342a9097b748e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:37 +0800 Subject: mm/compaction: call list_is_{first}/{last} more intuitively in move_freelist_{head}/{tail} We use move_freelist_head after list_for_each_entry_reverse to skip recent pages. And there is no need to do actual move if all freepages are searched in list_for_each_entry_reverse, e.g. freepage point to first page in freelist. It's more intuitively to call list_is_first with list entry as the first argument and list head as the second argument to check if list entry is the first list entry instead of call list_is_last with list entry and list head passed in reverse. Similarly, call list_is_last in move_freelist_tail is more intuitively. Link: https://lkml.kernel.org/r/20230901155141.249860-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e3ee1bc1c0ad..a40550a33aee 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1395,7 +1395,7 @@ move_freelist_head(struct list_head *freelist, struct page *freepage) { LIST_HEAD(sublist); - if (!list_is_last(freelist, &freepage->buddy_list)) { + if (!list_is_first(&freepage->buddy_list, freelist)) { list_cut_before(&sublist, freelist, &freepage->buddy_list); list_splice_tail(&sublist, freelist); } @@ -1412,7 +1412,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) { LIST_HEAD(sublist); - if (!list_is_first(freelist, &freepage->buddy_list)) { + if (!list_is_last(&freepage->buddy_list, freelist)) { list_cut_position(&sublist, freelist, &freepage->buddy_list); list_splice_tail(&sublist, freelist); } -- cgit v1.2.3 From 3da0272a4c7d0d37b47b28e87014f421296fc2be Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:38 +0800 Subject: mm/compaction: correctly return failure with bogus compound_order in strict mode In strict mode, we should return 0 if there is any hole in pageblock. If we successfully isolated pages at beginning at pageblock and then have a bogus compound_order outside pageblock in next page. We will abort search loop with blockpfn > end_pfn. Although we will limit blockpfn to end_pfn, we will treat it as a successful isolation in strict mode as blockpfn is not < end_pfn and return partial isolated pages. Then isolate_freepages_range may success unexpectly with hole in isolated range. Link: https://lkml.kernel.org/r/20230901155141.249860-4-shikemeng@huaweicloud.com Fixes: 9fcd6d2e052e ("mm, compaction: skip compound pages by order in free scanner") Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index a40550a33aee..9ecbfbc695e5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -626,11 +626,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (PageCompound(page)) { const unsigned int order = compound_order(page); - if (likely(order <= MAX_ORDER)) { + if (blockpfn + (1UL << order) <= end_pfn) { blockpfn += (1UL << order) - 1; page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } + goto isolate_fail; } @@ -678,8 +679,7 @@ isolate_fail: spin_unlock_irqrestore(&cc->zone->lock, flags); /* - * There is a tiny chance that we have read bogus compound_order(), - * so be careful to not go outside of the pageblock. + * Be careful to not go outside of the pageblock. */ if (unlikely(blockpfn > end_pfn)) blockpfn = end_pfn; -- cgit v1.2.3 From 8df4e28c64188911fba33789bf2cb882b3ae524e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:39 +0800 Subject: mm/compaction: remove repeat compact_blockskip_flush check in reset_isolation_suitable We have compact_blockskip_flush check in __reset_isolation_suitable, just remove repeat check before __reset_isolation_suitable in compact_blockskip_flush. Link: https://lkml.kernel.org/r/20230901155141.249860-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 9ecbfbc695e5..c377d78e0f15 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -382,6 +382,7 @@ static void __reset_isolation_suitable(struct zone *zone) bool source_set = false; bool free_set = false; + /* Only flush if a full compaction finished recently */ if (!zone->compact_blockskip_flush) return; @@ -434,9 +435,7 @@ void reset_isolation_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - /* Only flush if a full compaction finished recently */ - if (zone->compact_blockskip_flush) - __reset_isolation_suitable(zone); + __reset_isolation_suitable(zone); } } -- cgit v1.2.3 From 9cc17ede5125933ab47f8f359c2cce3aca8ee757 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:40 +0800 Subject: mm/compaction: improve comment of is_via_compact_memory We do proactive compaction with order == -1 via 1. /proc/sys/vm/compact_memory 2. /sys/devices/system/node/nodex/compact 3. /proc/sys/vm/compaction_proactiveness Add missed situation in which order == -1. Link: https://lkml.kernel.org/r/20230901155141.249860-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Acked-by: Mel Gorman Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/compaction.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c377d78e0f15..ff3426a0d9c5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2065,8 +2065,10 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) } /* - * order == -1 is expected when compacting via - * /proc/sys/vm/compact_memory + * order == -1 is expected when compacting proactively via + * 1. /proc/sys/vm/compact_memory + * 2. /sys/devices/system/node/nodex/compact + * 3. /proc/sys/vm/compaction_proactiveness */ static inline bool is_via_compact_memory(int order) { -- cgit v1.2.3 From e19a3f595ae47bd8c034b98eb0b28a3877413387 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 1 Sep 2023 23:51:41 +0800 Subject: mm/compaction: factor out code to test if we should run compaction for target order We always do zone_watermark_ok check and compaction_suitable check together to test if compaction for target order should be ran. Factor these code out to remove repeat code. Link: https://lkml.kernel.org/r/20230901155141.249860-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 66 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index ff3426a0d9c5..01ba298739dd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2378,6 +2378,30 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order, return false; } +/* + * Should we do compaction for target allocation order. + * Return COMPACT_SUCCESS if allocation for target order can be already + * satisfied + * Return COMPACT_SKIPPED if compaction for target order is likely to fail + * Return COMPACT_CONTINUE if compaction for target order should be ran + */ +static enum compact_result +compaction_suit_allocation_order(struct zone *zone, unsigned int order, + int highest_zoneidx, unsigned int alloc_flags) +{ + unsigned long watermark; + + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); + if (zone_watermark_ok(zone, order, watermark, highest_zoneidx, + alloc_flags)) + return COMPACT_SUCCESS; + + if (!compaction_suitable(zone, order, highest_zoneidx)) + return COMPACT_SKIPPED; + + return COMPACT_CONTINUE; +} + static enum compact_result compact_zone(struct compact_control *cc, struct capture_control *capc) { @@ -2403,19 +2427,11 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) cc->migratetype = gfp_migratetype(cc->gfp_mask); if (!is_via_compact_memory(cc->order)) { - unsigned long watermark; - - /* Allocation can already succeed, nothing to do */ - watermark = wmark_pages(cc->zone, - cc->alloc_flags & ALLOC_WMARK_MASK); - if (zone_watermark_ok(cc->zone, cc->order, watermark, - cc->highest_zoneidx, cc->alloc_flags)) - return COMPACT_SUCCESS; - - /* Compaction is likely to fail */ - if (!compaction_suitable(cc->zone, cc->order, - cc->highest_zoneidx)) - return COMPACT_SKIPPED; + ret = compaction_suit_allocation_order(cc->zone, cc->order, + cc->highest_zoneidx, + cc->alloc_flags); + if (ret != COMPACT_CONTINUE) + return ret; } /* @@ -2914,6 +2930,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) int zoneid; struct zone *zone; enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx; + enum compact_result ret; for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) { zone = &pgdat->node_zones[zoneid]; @@ -2921,14 +2938,10 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) if (!populated_zone(zone)) continue; - /* Allocation can already succeed, check other zones */ - if (zone_watermark_ok(zone, pgdat->kcompactd_max_order, - min_wmark_pages(zone), - highest_zoneidx, 0)) - continue; - - if (compaction_suitable(zone, pgdat->kcompactd_max_order, - highest_zoneidx)) + ret = compaction_suit_allocation_order(zone, + pgdat->kcompactd_max_order, + highest_zoneidx, ALLOC_WMARK_MIN); + if (ret == COMPACT_CONTINUE) return true; } @@ -2951,6 +2964,8 @@ static void kcompactd_do_work(pg_data_t *pgdat) .ignore_skip_hint = false, .gfp_mask = GFP_KERNEL, }; + enum compact_result ret; + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, cc.highest_zoneidx); count_compact_event(KCOMPACTD_WAKE); @@ -2965,12 +2980,9 @@ static void kcompactd_do_work(pg_data_t *pgdat) if (compaction_deferred(zone, cc.order)) continue; - /* Allocation can already succeed, nothing to do */ - if (zone_watermark_ok(zone, cc.order, - min_wmark_pages(zone), zoneid, 0)) - continue; - - if (!compaction_suitable(zone, cc.order, zoneid)) + ret = compaction_suit_allocation_order(zone, + cc.order, zoneid, ALLOC_WMARK_MIN); + if (ret != COMPACT_CONTINUE) continue; if (kthread_should_stop()) -- cgit v1.2.3 From b6afcb94ce331b6d94dbeb56def173483993faf3 Mon Sep 17 00:00:00 2001 From: Ding Xiang Date: Thu, 31 Aug 2023 17:31:44 +0800 Subject: selftests/mm: gup_longterm: fix a resource leak The opened file should be closed in run_with_tmpfile(), otherwise resource leak will occur Link: https://lkml.kernel.org/r/20230831093144.7520-1-dingxiang@cmss.chinamobile.com Signed-off-by: Ding Xiang Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/gup_longterm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/gup_longterm.c b/tools/testing/selftests/mm/gup_longterm.c index d33d3e68ffab..ad168d35b23b 100644 --- a/tools/testing/selftests/mm/gup_longterm.c +++ b/tools/testing/selftests/mm/gup_longterm.c @@ -265,10 +265,11 @@ static void run_with_tmpfile(test_fn fn, const char *desc) fd = fileno(file); if (fd < 0) { ksft_test_result_fail("fileno() failed\n"); - return; + goto close; } fn(fd, pagesize); +close: fclose(file); } -- cgit v1.2.3 From 954652b9f33bb1892ea4448479d78779e4a8ae13 Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 29 Aug 2023 17:45:49 -0700 Subject: mm/mremap: fix unaccount of memory on vma_merge() failure Fix mremap so that only accounted memory is unaccounted if the mapping is expandable but vma_merge() fails. Link: https://lkml.kernel.org/r/20230830004549.16131-1-anthony.yznaga@oracle.com Fixes: fdbef6149135 ("mm/mremap: don't account pages in vma_to_resize()") Signed-off-by: Anthony Yznaga Acked-by: Brian Geffon Signed-off-by: Andrew Morton --- mm/mremap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 382e81c33fc4..fbb4861964f6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1037,12 +1037,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, pgoff_t extension_pgoff = vma->vm_pgoff + ((extension_start - vma->vm_start) >> PAGE_SHIFT); VMA_ITERATOR(vmi, mm, extension_start); + long charged = 0; if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { ret = -ENOMEM; goto out; } + charged = pages; } /* @@ -1058,7 +1060,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, vma->vm_file, extension_pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (!vma) { - vm_unacct_memory(pages); + vm_unacct_memory(charged); ret = -ENOMEM; goto out; } -- cgit v1.2.3 From dd34d9fe3b427dea47f81443d022dbaec4339f7e Mon Sep 17 00:00:00 2001 From: Anthony Yznaga Date: Tue, 29 Aug 2023 17:43:24 -0700 Subject: mm: fix unaccount of memory on vma_link() failure Fix insert_vm_struct() so that only accounted memory is unaccounted if vma_link() fails. Link: https://lkml.kernel.org/r/20230830004324.16101-1-anthony.yznaga@oracle.com Fixes: d4af56c5c7c6 ("mm: start tracking VMAs with maple tree") Signed-off-by: Anthony Yznaga Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/mmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index b56a7f0c9f85..1ec6c875a7f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3278,7 +3278,8 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) } if (vma_link(mm, vma)) { - vm_unacct_memory(charged); + if (vma->vm_flags & VM_ACCOUNT) + vm_unacct_memory(charged); return -ENOMEM; } -- cgit v1.2.3 From d8f5f7e445f02eb10dee1a0a992146314cf460f8 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 29 Aug 2023 14:37:34 -0700 Subject: hugetlb: set hugetlb page flag before optimizing vmemmap Currently, vmemmap optimization of hugetlb pages is performed before the hugetlb flag (previously hugetlb destructor) is set identifying it as a hugetlb folio. This means there is a window of time where an ordinary folio does not have all associated vmemmap present. The core mm only expects vmemmap to be potentially optimized for hugetlb and device dax. This can cause problems in code such as memory error handling that may want to write to tail struct pages. There is only one call to perform hugetlb vmemmap optimization today. To fix this issue, simply set the hugetlb flag before that call. There was a similar issue in the free hugetlb path that was previously addressed. The two routines that optimize or restore hugetlb vmemmap should only be passed hugetlb folios/pages. To catch any callers not following this rule, add VM_WARN_ON calls to the routines. In the hugetlb free code paths, some calls could be made to restore vmemmap after clearing the hugetlb flag. This was 'safe' as in these cases vmemmap was already present and the call was a NOOP. However, for consistency these calls where eliminated so that we can add the VM_WARN_ON checks. Link: https://lkml.kernel.org/r/20230829213734.69673-1-mike.kravetz@oracle.com Fixes: f41f2ed43ca5 ("mm: hugetlb: free the vmemmap pages associated with each HugeTLB page") Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: James Houghton Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 31 ++++++++++++++++++++++--------- mm/hugetlb_vmemmap.c | 3 +++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 52d26072dfda..afce9037f9ee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1720,7 +1720,12 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; - if (hugetlb_vmemmap_restore(h, &folio->page)) { + /* + * If folio is not vmemmap optimized (!clear_dtor), then the folio + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * can only be passed hugetlb pages and will BUG otherwise. + */ + if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1930,9 +1935,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { + folio_set_hugetlb(folio); hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -3580,13 +3585,21 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (rc) { - /* Allocation of vmemmmap failed, we can not demote folio */ - spin_lock_irq(&hugetlb_lock); - folio_ref_unfreeze(folio, 1); - add_hugetlb_folio(h, folio, false); - return rc; + /* + * If vmemmap already existed for folio, the remove routine above would + * have cleared the hugetlb folio flag. Hence the folio is technically + * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * passed hugetlb folios and will BUG otherwise. + */ + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + /* Allocation of vmemmmap failed, we can not demote folio */ + spin_lock_irq(&hugetlb_lock); + folio_ref_unfreeze(folio, 1); + add_hugetlb_folio(h, folio, false); + return rc; + } } /* diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4b9734777f69..aeb7dd889eee 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include "hugetlb_vmemmap.h" @@ -456,6 +457,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!HPageVmemmapOptimized(head)) return 0; @@ -550,6 +552,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; + VM_WARN_ON_ONCE(!PageHuge(head)); if (!vmemmap_should_optimize(h, head)) return; -- cgit v1.2.3 From b72b3c9c34c825c81d205241c5f822fc7835923f Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Tue, 29 Aug 2023 11:33:43 +0800 Subject: mm/hugetlb: fix nodes huge page allocation when there are surplus pages In set_nr_huge_pages(), local variable "count" is used to record persistent_huge_pages(), but when it cames to nodes huge page allocation, the semantics changes to nr_huge_pages. When there exists surplus huge pages and using the interface under /sys/devices/system/node/node*/hugepages to change huge page pool size, this difference can result in the allocation of an unexpected number of huge pages. Steps to reproduce the bug: Starting with: Node 0 Node 1 Total HugePages_Total 0.00 0.00 0.00 HugePages_Free 0.00 0.00 0.00 HugePages_Surp 0.00 0.00 0.00 create 100 huge pages in Node 0 and consume it, then set Node 0 's nr_hugepages to 0. yields: Node 0 Node 1 Total HugePages_Total 200.00 0.00 200.00 HugePages_Free 0.00 0.00 0.00 HugePages_Surp 200.00 0.00 200.00 write 100 to Node 1's nr_hugepages echo 100 > /sys/devices/system/node/node1/\ hugepages/hugepages-2048kB/nr_hugepages gets: Node 0 Node 1 Total HugePages_Total 200.00 400.00 600.00 HugePages_Free 0.00 400.00 400.00 HugePages_Surp 200.00 0.00 200.00 Kernel is expected to create only 100 huge pages and it gives 200. Link: https://lkml.kernel.org/r/20230829033343.467779-1-xueshi.hu@smartx.com Fixes: 9a30523066cd ("hugetlb: add per node hstate attributes") Signed-off-by: Xueshi Hu Reviewed-by: Mike Kravetz Cc: Andi Kleen Cc: Lee Schermerhorn Cc: Mel Gorman Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index afce9037f9ee..7c90c43574a6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3457,7 +3457,9 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, if (nid != NUMA_NO_NODE) { unsigned long old_count = count; - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + count += persistent_huge_pages(h) - + (h->nr_huge_pages_node[nid] - + h->surplus_huge_pages_node[nid]); /* * User may have specified a large count value which caused the * above calculation to overflow. In this case, they wanted -- cgit v1.2.3 From 80e4a765a770c491e081463c38be927a876e2ce8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 27 Aug 2023 12:08:48 +0100 Subject: mm: refactor si_mem_available() si_mem_available() needlessly places LRU statistics into an array before retrieving only two of them, simply access those directly. In addition, refactor the code so that the blocks of code which calculate the page cache and reclaimable components each resemble one another to clearly indicate we cap both against wmark_low in the same fashion. Link: https://lkml.kernel.org/r/20230827110848.43510-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/show_mem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index 4b888b18bdde..ba0808d6917f 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -34,13 +34,8 @@ long si_mem_available(void) long available; unsigned long pagecache; unsigned long wmark_low = 0; - unsigned long pages[NR_LRU_LISTS]; unsigned long reclaimable; struct zone *zone; - int lru; - - for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) - pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) wmark_low += low_wmark_pages(zone); @@ -56,7 +51,8 @@ long si_mem_available(void) * start swapping or thrashing. Assume at least half of the page * cache, or the low watermark worth of cache, needs to stay. */ - pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); pagecache -= min(pagecache / 2, wmark_low); available += pagecache; @@ -67,7 +63,8 @@ long si_mem_available(void) */ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - available += reclaimable - min(reclaimable / 2, wmark_low); + reclaimable -= min(reclaimable / 2, wmark_low); + available += reclaimable; if (available < 0) available = 0; -- cgit v1.2.3 From 97144ce008f918249fa7275ee1d29f6f27665c34 Mon Sep 17 00:00:00 2001 From: Vern Hao Date: Fri, 25 Aug 2023 15:57:34 +0800 Subject: mm/vmscan: use folio_migratetype() instead of get_pageblock_migratetype() In skip_cma(), we can use folio_migratetype() to replace get_pageblock_migratetype(). Link: https://lkml.kernel.org/r/20230825075735.52436-1-user@VERNHAO-MC1 Signed-off-by: Vern Hao Reviewed-by: David Hildenbrand Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6f13394b112e..93fdeadea896 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2271,7 +2271,7 @@ static bool skip_cma(struct folio *folio, struct scan_control *sc) { return !current_is_kswapd() && gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE && - get_pageblock_migratetype(&folio->page) == MIGRATE_CMA; + folio_migratetype(folio) == MIGRATE_CMA; } #else static bool skip_cma(struct folio *folio, struct scan_control *sc) -- cgit v1.2.3 From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but the patch failed to fully clean it up. Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- fs/exec.c | 2 -- include/linux/mm.h | 8 -------- kernel/exit.c | 4 ---- kernel/fork.c | 4 ---- kernel/kthread.c | 1 - mm/madvise.c | 5 +---- mm/memory.c | 2 -- 7 files changed, 1 insertion(+), 25 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 6518e33ea813..eb039041ad6a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm) tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); - if (old_mm) - sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) diff --git a/include/linux/mm.h b/include/linux/mm.h index bf5d0b1b16f4..7613150acab9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2628,14 +2628,6 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, *maxrss = hiwater_rss; } -#if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct mm_struct *mm); -#else -static inline void sync_mm_rss(struct mm_struct *mm) -{ -} -#endif - #ifndef CONFIG_ARCH_HAS_PTE_SPECIAL static inline int pte_special(pte_t pte) { diff --git a/kernel/exit.c b/kernel/exit.c index edb50b4c9972..3cdbe797008f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -539,7 +539,6 @@ static void exit_mm(void) exit_mm_release(current, mm); if (!mm) return; - sync_mm_rss(mm); mmap_read_lock(mm); mmgrab_lazy_tlb(mm); BUG_ON(mm != current->active_mm); @@ -829,9 +828,6 @@ void __noreturn do_exit(long code) io_uring_files_cancel(); exit_signals(tsk); /* sets PF_EXITING */ - /* sync mm's RSS info before statistics gathering */ - if (tsk->mm) - sync_mm_rss(tsk->mm); acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { diff --git a/kernel/fork.c b/kernel/fork.c index 3b6d20dfb9a8..1779183a7cb3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2406,10 +2406,6 @@ __latent_entropy struct task_struct *copy_process( p->io_uring = NULL; #endif -#if defined(SPLIT_RSS_COUNTING) - memset(&p->rss_stat, 0, sizeof(p->rss_stat)); -#endif - p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI diff --git a/kernel/kthread.c b/kernel/kthread.c index 1eea53050bab..c46128ec0c0a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1469,7 +1469,6 @@ void kthread_unuse_mm(struct mm_struct *mm) * clearing tsk->mm. */ smp_mb__after_spinlock(); - sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); diff --git a/mm/madvise.c b/mm/madvise.c index 4dded5d27e7e..59e8860c86af 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -746,11 +746,8 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, folio_mark_lazyfree(folio); } - if (nr_swap) { - if (current->mm == mm) - sync_mm_rss(mm); + if (nr_swap) add_mm_counter(mm, MM_SWAPENTS, nr_swap); - } if (start_pte) { arch_leave_lazy_mmu_mode(); pte_unmap_unlock(start_pte, ptl); diff --git a/mm/memory.c b/mm/memory.c index 6c264d2f969c..0739ccb00e61 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -471,8 +471,6 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; - if (current->mm == mm) - sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); -- cgit v1.2.3 From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 23 Aug 2023 00:13:14 +0100 Subject: mm: convert DAX lock/unlock page to lock/unlock folio The one caller of DAX lock/unlock page already calls compound_head(), so use page_folio() instead, then use a folio throughout the DAX code to remove uses of page->mapping and page->index. [jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization] Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jane Chu Acked-by: Naoya Horiguchi Cc: Dan Williams Cc: Jane Chu Signed-off-by: Andrew Morton --- fs/dax.c | 24 ++++++++++++------------ include/linux/dax.h | 10 +++++----- mm/memory-failure.c | 29 ++++++++++++++++------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 8fafecbe42b1..3380b43cb6bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry) return NULL; } -/* - * dax_lock_page - Lock the DAX entry corresponding to a page - * @page: The page whose entry we want to lock +/** + * dax_lock_folio - Lock the DAX entry corresponding to a folio + * @folio: The folio whose entry we want to lock * * Context: Process context. - * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could + * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked. */ -dax_entry_t dax_lock_page(struct page *page) +dax_entry_t dax_lock_folio(struct folio *folio) { XA_STATE(xas, NULL, 0); void *entry; - /* Ensure page->mapping isn't freed while we look at it */ + /* Ensure folio->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { - struct address_space *mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(folio->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) @@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page) xas.xa = &mapping->i_pages; xas_lock_irq(&xas); - if (mapping != page->mapping) { + if (mapping != folio->mapping) { xas_unlock_irq(&xas); continue; } - xas_set(&xas, page->index); + xas_set(&xas, folio->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); @@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page) return (dax_entry_t)entry; } -void dax_unlock_page(struct page *page, dax_entry_t cookie) +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { - struct address_space *mapping = page->mapping; - XA_STATE(xas, &mapping->i_pages, page->index); + struct address_space *mapping = folio->mapping; + XA_STATE(xas, &mapping->i_pages, folio->index); if (S_ISCHR(mapping->host->i_mode)) return; diff --git a/include/linux/dax.h b/include/linux/dax.h index 22cd9902345d..b463502b16e1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); -dax_entry_t dax_lock_page(struct page *page); -void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_folio(struct folio *folio); +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie); dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page); void dax_unlock_mapping_entry(struct address_space *mapping, @@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping, return -EOPNOTSUPP; } -static inline dax_entry_t dax_lock_page(struct page *page) +static inline dax_entry_t dax_lock_folio(struct folio *folio) { - if (IS_DAX(page->mapping->host)) + if (IS_DAX(folio->mapping->host)) return ~0UL; return 0; } -static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) +static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4d6e43c88489..660c21859118 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1713,20 +1713,23 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); } +/* + * Only dev_pagemap pages get here, such as fsdax when the filesystem + * either do not claim or fails to claim a hwpoison event, or devdax. + * The fsdax pages are initialized per base page, and the devdax pages + * could be initialized either as base pages, or as compound pages with + * vmemmap optimization enabled. Devdax is simplistic in its dealing with + * hwpoison, such that, if a subpage of a compound page is poisoned, + * simply mark the compound head page is by far sufficient. + */ static int mf_generic_kill_procs(unsigned long long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); + struct folio *folio = pfn_folio(pfn); LIST_HEAD(to_kill); dax_entry_t cookie; int rc = 0; - /* - * Pages instantiated by device-dax (not filesystem-dax) - * may be compound pages. - */ - page = compound_head(page); - /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by @@ -1734,11 +1737,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * also prevents changes to the mapping of this pfn until * poison signaling is complete. */ - cookie = dax_lock_page(page); + cookie = dax_lock_folio(folio); if (!cookie) return -EBUSY; - if (hwpoison_filter(page)) { + if (hwpoison_filter(&folio->page)) { rc = -EOPNOTSUPP; goto unlock; } @@ -1760,7 +1763,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * Use this flag as an indication that the dax page has been * remapped UC to prevent speculative consumption of poison. */ - SetPageHWPoison(page); + SetPageHWPoison(&folio->page); /* * Unlike System-RAM there is no possibility to swap in a @@ -1769,11 +1772,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &to_kill, true); + collect_procs(&folio->page, &to_kill, true); - unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags); + unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags); unlock: - dax_unlock_page(page, cookie); + dax_unlock_folio(folio, cookie); return rc; } -- cgit v1.2.3 From 77cd814835df22e177a9caac1b046f7ffdfeedd0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Mon, 4 Sep 2023 17:08:49 +0200 Subject: mm/vmstat: use this_cpu_try_cmpxchg in mod_{zone,node}_state Use this_cpu_try_cmpxchg instead of this_cpu_cmpxchg (*ptr, old, new) == old in mod_zone_state and mod_node_state. x86 CMPXCHG instruction returns success in ZF flag, so this change saves a compare after cmpxchg (and related move instruction in front of cmpxchg). Also, try_cmpxchg implicitly assigns old *ptr value to "old" when cmpxchg fails. There is no need to re-read the value in the loop. No functional change intended. Link: https://lkml.kernel.org/r/20230904150917.8318-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Signed-off-by: Andrew Morton --- mm/vmstat.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 00e81e99c6ee..894e4c88d241 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -559,8 +559,10 @@ static inline void mod_zone_state(struct zone *zone, { struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats; s8 __percpu *p = pcp->vm_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; + o = this_cpu_read(*p); do { z = 0; /* overflow to zone counters */ @@ -576,8 +578,7 @@ static inline void mod_zone_state(struct zone *zone, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; @@ -586,7 +587,7 @@ static inline void mod_zone_state(struct zone *zone, z = n + os; n = -os; } - } while (this_cpu_cmpxchg(*p, o, n) != o); + } while (!this_cpu_try_cmpxchg(*p, &o, n)); if (z) zone_page_state_add(z, zone, item); @@ -616,7 +617,8 @@ static inline void mod_node_state(struct pglist_data *pgdat, { struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats; s8 __percpu *p = pcp->vm_node_stat_diff + item; - long o, n, t, z; + long n, t, z; + s8 o; if (vmstat_item_in_bytes(item)) { /* @@ -629,6 +631,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, delta >>= PAGE_SHIFT; } + o = this_cpu_read(*p); do { z = 0; /* overflow to node counters */ @@ -644,8 +647,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, */ t = this_cpu_read(pcp->stat_threshold); - o = this_cpu_read(*p); - n = delta + o; + n = delta + (long)o; if (abs(n) > t) { int os = overstep_mode * (t >> 1) ; @@ -654,7 +656,7 @@ static inline void mod_node_state(struct pglist_data *pgdat, z = n + os; n = -os; } - } while (this_cpu_cmpxchg(*p, o, n) != o); + } while (!this_cpu_try_cmpxchg(*p, &o, n)); if (z) node_page_state_add(z, pgdat, item); -- cgit v1.2.3 From 2eaa6c2abb9dd55041a05c20c451790c124d5cf0 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Tue, 5 Sep 2023 20:45:03 +0800 Subject: mm: hugetlb_vmemmap: fix hugetlb page number decrease failed on movable nodes The decreasing of hugetlb pages number failed with the following message given: sh: page allocation failure: order:0, mode:0x204cc0(GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_THISNODE) CPU: 1 PID: 112 Comm: sh Not tainted 6.5.0-rc7-... #45 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace.part.6+0x84/0xe4 show_stack+0x18/0x24 dump_stack_lvl+0x48/0x60 dump_stack+0x18/0x24 warn_alloc+0x100/0x1bc __alloc_pages_slowpath.constprop.107+0xa40/0xad8 __alloc_pages+0x244/0x2d0 hugetlb_vmemmap_restore+0x104/0x1e4 __update_and_free_hugetlb_folio+0x44/0x1f4 update_and_free_hugetlb_folio+0x20/0x68 update_and_free_pages_bulk+0x4c/0xac set_max_huge_pages+0x198/0x334 nr_hugepages_store_common+0x118/0x178 nr_hugepages_store+0x18/0x24 kobj_attr_store+0x18/0x2c sysfs_kf_write+0x40/0x54 kernfs_fop_write_iter+0x164/0x1dc vfs_write+0x3a8/0x460 ksys_write+0x6c/0x100 __arm64_sys_write+0x1c/0x28 invoke_syscall+0x44/0x100 el0_svc_common.constprop.1+0x6c/0xe4 do_el0_svc+0x38/0x94 el0_svc+0x28/0x74 el0t_64_sync_handler+0xa0/0xc4 el0t_64_sync+0x174/0x178 Mem-Info: ... The reason is that the hugetlb pages being released are allocated from movable nodes, and with hugetlb_optimize_vmemmap enabled, vmemmap pages need to be allocated from the same node during the hugetlb pages releasing. With GFP_KERNEL and __GFP_THISNODE set, allocating from movable node is always failed. Fix this problem by removing __GFP_THISNODE. Link: https://lkml.kernel.org/r/20230905124503.24899-1-yuancan@huawei.com Fixes: ad2fa3717b74 ("mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page") Signed-off-by: Yuan Can Reviewed-by: Muchun Song Cc: Kefeng Wang Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index aeb7dd889eee..3e5387835ad6 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -381,7 +381,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { - gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_THISNODE; + gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; -- cgit v1.2.3 From af8ca1c149069176e6322a77b532e3ffd99ccffe Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:22 +0000 Subject: mm/mremap: optimize the start addresses in move_page_tables() Patch series "Optimize mremap during mutual alignment within PMD", v6. This patchset optimizes the start addresses in move_page_tables() and tests the changes. It addresses a warning [1] that occurs due to a downward, overlapping move on a mutually-aligned offset within a PMD during exec. By initiating the copy process at the PMD level when such alignment is present, we can prevent this warning and speed up the copying process at the same time. Linus Torvalds suggested this idea. Check the individual patches for more details. [1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/ This patch (of 7): Recently, we see reports [1] of a warning that triggers due to move_page_tables() doing a downward and overlapping move on a mutually-aligned offset within a PMD. By mutual alignment, I mean the source and destination addresses of the mremap are at the same offset within a PMD. This mutual alignment along with the fact that the move is downward is sufficient to cause a warning related to having an allocated PMD that does not have PTEs in it. This warning will only trigger when there is mutual alignment in the move operation. A solution, as suggested by Linus Torvalds [2], is to initiate the copy process at the PMD level whenever such alignment is present. Implementing this approach will not only prevent the warning from being triggered, but it will also optimize the operation as this method should enhance the speed of the copy process whenever there's a possibility to start copying at the PMD level. Some more points: a. The optimization can be done only when both the source and destination of the mremap do not have anything mapped below it up to a PMD boundary. I add support to detect that. b. #1 is not a problem for the call to move_page_tables() from exec.c as nothing is expected to be mapped below the source. However, for non-overlapping mutually aligned moves as triggered by mremap(2), I added support for checking such cases. c. I currently only optimize for PMD moves, in the future I/we can build on this work and do PUD moves as well if there is a need for this. But I want to take it one step at a time. d. We need to be careful about mremap of ranges within the VMA itself. For this purpose, I added checks to determine if the address after alignment falls within its VMA itself. [1] https://lore.kernel.org/all/ZB2GTBD%2FLWTrkOiO@dhcp22.suse.cz/ [2] https://lore.kernel.org/all/CAHk-=whd7msp8reJPfeGNyt0LiySMT0egExx3TVZSX3Ok6X=9g@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230903151328.2981432-1-joel@joelfernandes.org Link: https://lkml.kernel.org/r/20230903151328.2981432-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Suggested-by: Linus Torvalds Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mremap.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/mm/mremap.c b/mm/mremap.c index fbb4861964f6..e2b65a17148e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -489,6 +489,53 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, return moved; } +/* + * A helper to check if a previous mapping exists. Required for + * move_page_tables() and realign_addr() to determine if a previous mapping + * exists before we can do realignment optimizations. + */ +static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, + unsigned long mask) +{ + unsigned long addr_masked = addr_to_align & mask; + + /* + * If @addr_to_align of either source or destination is not the beginning + * of the corresponding VMA, we can't align down or we will destroy part + * of the current mapping. + */ + if (vma->vm_start != addr_to_align) + return false; + + /* + * Make sure the realignment doesn't cause the address to fall on an + * existing mapping. + */ + return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; +} + +/* Opportunistically realign to specified boundary for faster copy. */ +static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, + unsigned long *new_addr, struct vm_area_struct *new_vma, + unsigned long mask) +{ + /* Skip if the addresses are already aligned. */ + if ((*old_addr & ~mask) == 0) + return; + + /* Only realign if the new and old addresses are mutually aligned. */ + if ((*old_addr & ~mask) != (*new_addr & ~mask)) + return; + + /* Ensure realignment doesn't cause overlap with existing mappings. */ + if (!can_align_down(old_vma, *old_addr, mask) || + !can_align_down(new_vma, *new_addr, mask)) + return; + + *old_addr = *old_addr & mask; + *new_addr = *new_addr & mask; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, @@ -508,6 +555,14 @@ unsigned long move_page_tables(struct vm_area_struct *vma, return move_hugetlb_page_tables(vma, new_vma, old_addr, new_addr, len); + /* + * If possible, realign addresses to PMD boundary for faster copy. + * Only realign if the mremap copying hits a PMD boundary. + */ + if ((vma != new_vma) + && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, old_addr, old_end); @@ -577,6 +632,13 @@ again: mmu_notifier_invalidate_range_end(&range); + /* + * Prevent negative return values when {old,new}_addr was realigned + * but we broke out of the above loop for the first PMD itself. + */ + if (len + old_addr < old_end) + return 0; + return len + old_addr - old_end; /* how much done */ } -- cgit v1.2.3 From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:23 +0000 Subject: mm/mremap: allow moves within the same VMA for stack moves For the stack move happening in shift_arg_pages(), the move is happening within the same VMA which spans the old and new ranges. In case the aligned address happens to fall within that VMA, allow such moves and don't abort the mremap alignment optimization. In the regular non-stack mremap case, we cannot allow any such moves as will end up destroying some part of the mapping (either the source of the move, or part of the existing mapping). So just avoid it for stack moves. Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- include/linux/mm.h | 2 +- mm/mremap.c | 33 +++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index eb039041ad6a..4aa19b24f281 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length, false)) + vma, new_start, length, false, true)) return -ENOMEM; lru_add_drain(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7613150acab9..21c3d4e8a282 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2480,7 +2480,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks); + bool need_rmap_locks, bool for_stack); /* * Flags used by change_protection(). For now we make it a bitmap so diff --git a/mm/mremap.c b/mm/mremap.c index e2b65a17148e..ce8a23ef325a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, } /* - * A helper to check if a previous mapping exists. Required for - * move_page_tables() and realign_addr() to determine if a previous mapping - * exists before we can do realignment optimizations. + * A helper to check if aligning down is OK. The aligned address should fall + * on *no mapping*. For the stack moving down, that's a special move within + * the VMA that is created to span the source and destination of the move, + * so we make an exception for it. */ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, - unsigned long mask) + unsigned long mask, bool for_stack) { unsigned long addr_masked = addr_to_align & mask; @@ -504,9 +505,13 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ - if (vma->vm_start != addr_to_align) + if (!for_stack && vma->vm_start != addr_to_align) return false; + /* In the stack case we explicitly permit in-VMA alignment. */ + if (for_stack && addr_masked >= vma->vm_start) + return true; + /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. @@ -517,7 +522,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali /* Opportunistically realign to specified boundary for faster copy. */ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, unsigned long *new_addr, struct vm_area_struct *new_vma, - unsigned long mask) + unsigned long mask, bool for_stack) { /* Skip if the addresses are already aligned. */ if ((*old_addr & ~mask) == 0) @@ -528,8 +533,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old return; /* Ensure realignment doesn't cause overlap with existing mappings. */ - if (!can_align_down(old_vma, *old_addr, mask) || - !can_align_down(new_vma, *new_addr, mask)) + if (!can_align_down(old_vma, *old_addr, mask, for_stack) || + !can_align_down(new_vma, *new_addr, mask, for_stack)) return; *old_addr = *old_addr & mask; @@ -539,7 +544,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks) + bool need_rmap_locks, bool for_stack) { unsigned long extent, old_end; struct mmu_notifier_range range; @@ -559,9 +564,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ - if ((vma != new_vma) - && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, + for_stack); flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, @@ -708,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks); + need_rmap_locks, false); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { @@ -722,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * and then proceed to unmap new area instead of old. */ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true); + true, false); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit v1.2.3 From 99eb26d59ce38a9f9ac0d63421deca6073f45086 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:24 +0000 Subject: selftests: mm: fix failure case when new remap region was not found When a valid remap region could not be found, the source mapping is not cleaned up. Fix the goto statement such that the clean up happens. Link: https://lkml.kernel.org/r/20230903151328.2981432-4-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 5c3773de9f0f..6822d657f589 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -316,7 +316,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, if (addr + c.dest_alignment < addr) { ksft_print_msg("Couldn't find a valid region to remap to\n"); ret = -1; - goto out; + goto clean_up_src; } addr += c.dest_alignment; } -- cgit v1.2.3 From 8ed873d8e5cdec3852bcb59f7cc6d017f7bc0ab7 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:25 +0000 Subject: selftests: mm: add a test for mutually aligned moves > PMD size This patch adds a test case to check if a PMD-alignment optimization successfully happens. I add support to make sure there is some room before the source mapping, otherwise the optimization to trigger PMD-aligned move will be disabled as the kernel will detect that a mapping before the source exists and such optimization becomes impossible. Link: https://lkml.kernel.org/r/20230903151328.2981432-5-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 6822d657f589..6304eb0947a3 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -44,6 +44,7 @@ enum { _1MB = 1ULL << 20, _2MB = 2ULL << 20, _4MB = 4ULL << 20, + _5MB = 5ULL << 20, _1GB = 1ULL << 30, _2GB = 2ULL << 30, PMD = _2MB, @@ -235,6 +236,11 @@ static void *get_source_mapping(struct config c) unsigned long long mmap_min_addr; mmap_min_addr = get_mmap_min_addr(); + /* + * For some tests, we need to not have any mappings below the + * source mapping. Add some headroom to mmap_min_addr for this. + */ + mmap_min_addr += 10 * _4MB; retry: addr += c.src_alignment; @@ -434,7 +440,7 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb, return 0; } -#define MAX_TEST 13 +#define MAX_TEST 14 #define MAX_PERF_TEST 3 int main(int argc, char **argv) { @@ -500,6 +506,10 @@ int main(int argc, char **argv) test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + /* Src and Dest addr 1MB aligned. 5MB mremap. */ + test_cases[13] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "5MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); /* -- cgit v1.2.3 From a4cb3b2433434377ae820ac4adcde1a570c6f332 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:26 +0000 Subject: selftests: mm: add a test for remapping to area immediately after existing mapping This patch adds support for verifying that we correctly handle the situation where something is already mapped before the destination of the remap. Any realignment of destination address and PMD-copy will destroy that existing mapping. In such cases, we need to avoid doing the optimization. To test this, we map an area called the preamble before the remap region. Then we verify after the mremap operation that this region did not get corrupted. Putting some prints in the kernel, I verified that we optimize correctly in different situations: Optimize when there is alignment and no previous mapping (this is tested by previous patch). can_align_down(old_vma->vm_start=2900000, old_addr=2900000, mask=-2097152): 0 can_align_down(new_vma->vm_start=2f00000, new_addr=2f00000, mask=-2097152): 0 === Starting move_page_tables === Doing PUD move for 2800000 -> 2e00000 of extent=200000 <-- Optimization Doing PUD move for 2a00000 -> 3000000 of extent=200000 Doing PUD move for 2c00000 -> 3200000 of extent=200000 Don't optimize when there is alignment but there is previous mapping (this is tested by this patch). Notice that can_align_down() returns 1 for the destination mapping as we detected there is something there. can_align_down(old_vma->vm_start=2900000, old_addr=2900000, mask=-2097152): 0 can_align_down(new_vma->vm_start=5700000, new_addr=5700000, mask=-2097152): 1 === Starting move_page_tables === Doing move_ptes for 2900000 -> 5700000 of extent=100000 <-- Unoptimized Doing PUD move for 2a00000 -> 5800000 of extent=200000 Doing PUD move for 2c00000 -> 5a00000 of extent=200000 Link: https://lkml.kernel.org/r/20230903151328.2981432-6-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 57 +++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 6304eb0947a3..d7366074e2a8 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -29,6 +29,7 @@ struct config { unsigned long long dest_alignment; unsigned long long region_size; int overlapping; + int dest_preamble_size; }; struct test { @@ -283,7 +284,7 @@ error: static long long remap_region(struct config c, unsigned int threshold_mb, char pattern_seed) { - void *addr, *src_addr, *dest_addr; + void *addr, *src_addr, *dest_addr, *dest_preamble_addr; unsigned long long i; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -300,7 +301,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, goto out; } - /* Set byte pattern */ + /* Set byte pattern for source block. */ srand(pattern_seed); for (i = 0; i < threshold; i++) memset((char *) src_addr + i, (char) rand(), 1); @@ -312,6 +313,9 @@ static long long remap_region(struct config c, unsigned int threshold_mb, addr = (void *) (((unsigned long long) src_addr + c.region_size + offset) & align_mask); + /* Remap after the destination block preamble. */ + addr += c.dest_preamble_size; + /* See comment in get_source_mapping() */ if (!((unsigned long long) addr & c.dest_alignment)) addr = (void *) ((unsigned long long) addr | c.dest_alignment); @@ -327,6 +331,24 @@ static long long remap_region(struct config c, unsigned int threshold_mb, addr += c.dest_alignment; } + if (c.dest_preamble_size) { + dest_preamble_addr = mmap((void *) addr - c.dest_preamble_size, c.dest_preamble_size, + PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (dest_preamble_addr == MAP_FAILED) { + ksft_print_msg("Failed to map dest preamble region: %s\n", + strerror(errno)); + ret = -1; + goto clean_up_src; + } + + /* Set byte pattern for the dest preamble block. */ + srand(pattern_seed); + for (i = 0; i < c.dest_preamble_size; i++) + memset((char *) dest_preamble_addr + i, (char) rand(), 1); + } + clock_gettime(CLOCK_MONOTONIC, &t_start); dest_addr = mremap(src_addr, c.region_size, c.region_size, MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); @@ -335,7 +357,7 @@ static long long remap_region(struct config c, unsigned int threshold_mb, if (dest_addr == MAP_FAILED) { ksft_print_msg("mremap failed: %s\n", strerror(errno)); ret = -1; - goto clean_up_src; + goto clean_up_dest_preamble; } /* Verify byte pattern after remapping */ @@ -353,6 +375,23 @@ static long long remap_region(struct config c, unsigned int threshold_mb, } } + /* Verify the dest preamble byte pattern after remapping */ + if (c.dest_preamble_size) { + srand(pattern_seed); + for (i = 0; i < c.dest_preamble_size; i++) { + char c = (char) rand(); + + if (((char *) dest_preamble_addr)[i] != c) { + ksft_print_msg("Preamble data after remap doesn't match at offset %d\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) dest_preamble_addr)[i] & 0xff); + ret = -1; + goto clean_up_dest; + } + } + } + start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; ret = end_ns - start_ns; @@ -365,6 +404,9 @@ static long long remap_region(struct config c, unsigned int threshold_mb, */ clean_up_dest: munmap(dest_addr, c.region_size); +clean_up_dest_preamble: + if (c.dest_preamble_size && dest_preamble_addr) + munmap(dest_preamble_addr, c.dest_preamble_size); clean_up_src: munmap(src_addr, c.region_size); out: @@ -440,7 +482,7 @@ static int parse_args(int argc, char **argv, unsigned int *threshold_mb, return 0; } -#define MAX_TEST 14 +#define MAX_TEST 15 #define MAX_PERF_TEST 3 int main(int argc, char **argv) { @@ -449,7 +491,7 @@ int main(int argc, char **argv) unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; int num_expand_tests = 2; - struct test test_cases[MAX_TEST]; + struct test test_cases[MAX_TEST] = {}; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; time_t t; @@ -510,6 +552,11 @@ int main(int argc, char **argv) test_cases[13] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS, "5MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + /* Src and Dest addr 1MB aligned. 5MB mremap. */ + test_cases[14] = MAKE_TEST(_1MB, _1MB, _5MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "5MB mremap - Source 1MB-aligned, Dest 1MB-aligned with 40MB Preamble"); + test_cases[14].config.dest_preamble_size = 10 * _4MB; + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); /* -- cgit v1.2.3 From 85a22845b094ec6eeb7c24c692f2290e56d9f07c Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:27 +0000 Subject: selftests: mm: add a test for remapping within a range Move a block of memory within a memory range. Any alignment optimization on the source address may cause corruption. Verify using kselftest that it works. I have also verified with tracing that such optimization does not happen due to this check in can_align_down(): if (!for_stack && vma->vm_start != addr_to_align) return false; Link: https://lkml.kernel.org/r/20230903151328.2981432-7-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 79 +++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index d7366074e2a8..12a095457f4c 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -23,6 +23,7 @@ #define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#define SIZE_MB(m) ((size_t)m * (1024 * 1024)) struct config { unsigned long long src_alignment; @@ -226,6 +227,79 @@ out: ksft_test_result_fail("%s\n", test_name); } +/* + * Verify that an mremap within a range does not cause corruption + * of unrelated part of range. + * + * Consider the following range which is 2MB aligned and is + * a part of a larger 20MB range which is not shown. Each + * character is 256KB below making the source and destination + * 2MB each. The lower case letters are moved (s to d) and the + * upper case letters are not moved. The below test verifies + * that the upper case S letters are not corrupted by the + * adjacent mremap. + * + * |DDDDddddSSSSssss| + */ +static void mremap_move_within_range(char pattern_seed) +{ + char *test_name = "mremap mremap move within range"; + void *src, *dest; + int i, success = 1; + + size_t size = SIZE_MB(20); + void *ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (ptr == MAP_FAILED) { + perror("mmap"); + success = 0; + goto out; + } + memset(ptr, 0, size); + + src = ptr + SIZE_MB(6); + src = (void *)((unsigned long)src & ~(SIZE_MB(2) - 1)); + + /* Set byte pattern for source block. */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(2); i++) { + ((char *)src)[i] = (char) rand(); + } + + dest = src - SIZE_MB(2); + + void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1), + MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1)); + if (new_ptr == MAP_FAILED) { + perror("mremap"); + success = 0; + goto out; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(1); i++) { + char c = (char) rand(); + + if (((char *)src)[i] != c) { + ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) src)[i] & 0xff); + success = 0; + } + } + +out: + if (munmap(ptr, size) == -1) + perror("munmap"); + + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -491,6 +565,7 @@ int main(int argc, char **argv) unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; int num_expand_tests = 2; + int num_misc_tests = 1; struct test test_cases[MAX_TEST] = {}; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -572,7 +647,7 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests + num_misc_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, @@ -590,6 +665,8 @@ int main(int argc, char **argv) fclose(maps_fp); + mremap_move_within_range(pattern_seed); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); -- cgit v1.2.3 From 7b709f38dc0faf0d6dfeff681e37e59f9aabebd6 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 3 Sep 2023 15:13:28 +0000 Subject: selftests: mm: add a test for moving from an offset from start of mapping It is possible that the aligned address falls on no existing mapping, however that does not mean that we can just align it down to that. This test verifies that the "vma->vm_start != addr_to_align" check in can_align_down() prevents disastrous results if aligning down when source and dest are mutually aligned within a PMD but the source/dest addresses requested are not at the beginning of the respective mapping containing these addresses. Link: https://lkml.kernel.org/r/20230903151328.2981432-8-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mremap_test.c | 189 ++++++++++++++++++++++--------- 1 file changed, 134 insertions(+), 55 deletions(-) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index 12a095457f4c..1f836e670a37 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -24,6 +24,7 @@ #define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) #define SIZE_MB(m) ((size_t)m * (1024 * 1024)) +#define SIZE_KB(k) ((size_t)k * 1024) struct config { unsigned long long src_alignment; @@ -148,6 +149,60 @@ static bool is_range_mapped(FILE *maps_fp, void *start, void *end) return success; } +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. + */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + /* + * For some tests, we need to not have any mappings below the + * source mapping. Add some headroom to mmap_min_addr for this. + */ + mmap_min_addr += 10 * _4MB; + +retry: + addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM || errno == EEXIST) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); + goto retry; + } + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + /* * This test validates that merge is called when expanding a mapping. * Mapping containing three pages is created, middle page is unmapped @@ -300,60 +355,6 @@ out: ksft_test_result_fail("%s\n", test_name); } -/* - * Returns the start address of the mapping on success, else returns - * NULL on failure. - */ -static void *get_source_mapping(struct config c) -{ - unsigned long long addr = 0ULL; - void *src_addr = NULL; - unsigned long long mmap_min_addr; - - mmap_min_addr = get_mmap_min_addr(); - /* - * For some tests, we need to not have any mappings below the - * source mapping. Add some headroom to mmap_min_addr for this. - */ - mmap_min_addr += 10 * _4MB; - -retry: - addr += c.src_alignment; - if (addr < mmap_min_addr) - goto retry; - - src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - if (src_addr == MAP_FAILED) { - if (errno == EPERM || errno == EEXIST) - goto retry; - goto error; - } - /* - * Check that the address is aligned to the specified alignment. - * Addresses which have alignments that are multiples of that - * specified are not considered valid. For instance, 1GB address is - * 2MB-aligned, however it will not be considered valid for a - * requested alignment of 2MB. This is done to reduce coincidental - * alignment in the tests. - */ - if (((unsigned long long) src_addr & (c.src_alignment - 1)) || - !((unsigned long long) src_addr & c.src_alignment)) { - munmap(src_addr, c.region_size); - goto retry; - } - - if (!src_addr) - goto error; - - return src_addr; -error: - ksft_print_msg("Failed to map source region: %s\n", - strerror(errno)); - return NULL; -} - /* Returns the time taken for the remap on success else returns -1. */ static long long remap_region(struct config c, unsigned int threshold_mb, char pattern_seed) @@ -487,6 +488,83 @@ out: return ret; } +/* + * Verify that an mremap aligning down does not destroy + * the beginning of the mapping just because the aligned + * down address landed on a mapping that maybe does not exist. + */ +static void mremap_move_1mb_from_start(char pattern_seed) +{ + char *test_name = "mremap move 1mb from start at 1MB+256KB aligned src"; + void *src = NULL, *dest = NULL; + int i, success = 1; + + /* Config to reuse get_source_mapping() to do an aligned mmap. */ + struct config c = { + .src_alignment = SIZE_MB(1) + SIZE_KB(256), + .region_size = SIZE_MB(6) + }; + + src = get_source_mapping(c); + if (!src) { + success = 0; + goto out; + } + + c.src_alignment = SIZE_MB(1) + SIZE_KB(256); + dest = get_source_mapping(c); + if (!dest) { + success = 0; + goto out; + } + + /* Set byte pattern for source block. */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(2); i++) { + ((char *)src)[i] = (char) rand(); + } + + /* + * Unmap the beginning of dest so that the aligned address + * falls on no mapping. + */ + munmap(dest, SIZE_MB(1)); + + void *new_ptr = mremap(src + SIZE_MB(1), SIZE_MB(1), SIZE_MB(1), + MREMAP_MAYMOVE | MREMAP_FIXED, dest + SIZE_MB(1)); + if (new_ptr == MAP_FAILED) { + perror("mremap"); + success = 0; + goto out; + } + + /* Verify byte pattern after remapping */ + srand(pattern_seed); + for (i = 0; i < SIZE_MB(1); i++) { + char c = (char) rand(); + + if (((char *)src)[i] != c) { + ksft_print_msg("Data at src at %d got corrupted due to unrelated mremap\n", + i); + ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, + ((char *) src)[i] & 0xff); + success = 0; + } + } + +out: + if (src && munmap(src, c.region_size) == -1) + perror("munmap src"); + + if (dest && munmap(dest, c.region_size) == -1) + perror("munmap dest"); + + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + static void run_mremap_test_case(struct test test_case, int *failures, unsigned int threshold_mb, unsigned int pattern_seed) @@ -565,7 +643,7 @@ int main(int argc, char **argv) unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; int num_expand_tests = 2; - int num_misc_tests = 1; + int num_misc_tests = 2; struct test test_cases[MAX_TEST] = {}; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -666,6 +744,7 @@ int main(int argc, char **argv) fclose(maps_fp); mremap_move_within_range(pattern_seed); + mremap_move_1mb_from_start(pattern_seed); if (run_perf_tests) { ksft_print_msg("\n%s\n", -- cgit v1.2.3 From 64d4d49c5f3bd9a8a24f52d1e9d16af1b368114b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 8 Sep 2023 16:51:15 -0700 Subject: zswap: change zswap's default allocator to zsmalloc Out of zswap's 3 allocators, zsmalloc is the clear superior in terms of memory utilization, both in theory and as observed in practice, with its high storage density and low internal fragmentation. zsmalloc is also more actively developed and maintained, since it is the allocator of choice for zswap for many users, as well as the only allocator for zram. A historical objection to the selection of zsmalloc as the default allocator for zswap is its lack of writeback capability. However, this has changed, with the zsmalloc writeback patchset, and the subsequent zswap LRU refactor. With this, there is not a lot of good reasons to keep zbud, an otherwise inferior allocator, as the default instead of zswap. This patch changes the default allocator to zsmalloc. The only exception is on settings without MMU, in which case zbud will remain as the default. Link: https://lkml.kernel.org/r/20230908235115.2943486-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5..45f2971ef642 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -130,6 +130,7 @@ config ZSWAP_COMPRESSOR_DEFAULT choice prompt "Default allocator" depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU default ZSWAP_ZPOOL_DEFAULT_ZBUD help Selects the default allocator for the compressed cache for -- cgit v1.2.3 From b4c078004a435712fb0f61e1ac84b4832ecaecdc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:19 +0000 Subject: Docs/admin-guide/mm/damon/usage: fixup missed :ref: keyword Patch series "mm/damon: misc fixups for documents, comments and its tracepoint". This patchset contains miscellaneous simple fixups for documents, comments and tracepoint of DAMON. This patch (of 11): A cross-link reference in DAMON usage document is missing ':ref:' Sphynx keyword. Fix it. Link: https://lkml.kernel.org/r/20230907022929.91361-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230907022929.91361-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 8da1b7281827..834eefe39650 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -392,7 +392,7 @@ pages of all memory cgroups except ``/having_care_already``.:: echo N > 1/matching Note that ``anon`` and ``memcg`` filters are currently supported only when -``paddr`` `implementation ` is being used. +``paddr`` :ref:`implementation ` is being used. Also, memory regions that are filtered out by ``addr`` or ``target`` filters are not counted as the scheme has tried to those, while regions that filtered -- cgit v1.2.3 From 75999724ba3ffcbda571ec11272644b76cfd9b9f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:20 +0000 Subject: Docs/admin-guide/mm/damon/usage: place debugfs usage at the bottom debugfs interface is deprecated. Put it at the bottom of the document so that readers have less chances to read it. Link: https://lkml.kernel.org/r/20230907022929.91361-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 39 ++++++++++++++-------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 834eefe39650..bffe9dd9b0d6 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -495,6 +495,25 @@ Please note that it's highly recommended to use user space tools like `damo `_ rather than manually reading and writing the files as above. Above is only for an example. +.. _tracepoint: + +Tracepoint for Monitoring Results +================================= + +Users can get the monitoring results via the :ref:`tried_regions +` or a tracepoint, ``damon:damon_aggregated``. +While the tried regions directory is useful for getting a snapshot, the +tracepoint is useful for getting a full record of the results. While the +monitoring is turned on, you could record the tracepoint events and show +results using tracepoint supporting tools like ``perf``. For example:: + + # echo on > monitor_on + # perf record -e damon:damon_aggregated & + # sleep 5 + # kill 9 $(pidof perf) + # echo off > monitor_on + # perf script + .. _debugfs_interface: debugfs Interface (DEPRECATED!) @@ -790,23 +809,3 @@ directory by putting the name of the context to the ``rm_contexts`` file. :: Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the root directory only. - - -.. _tracepoint: - -Tracepoint for Monitoring Results -================================= - -Users can get the monitoring results via the :ref:`tried_regions -` or a tracepoint, ``damon:damon_aggregated``. -While the tried regions directory is useful for getting a snapshot, the -tracepoint is useful for getting a full record of the results. While the -monitoring is turned on, you could record the tracepoint events and show -results using tracepoint supporting tools like ``perf``. For example:: - - # echo on > monitor_on - # perf record -e damon:damon_aggregated & - # sleep 5 - # kill 9 $(pidof perf) - # echo off > monitor_on - # perf script -- cgit v1.2.3 From 4f7112786e5c670fed2787d4f80291aa55c4a26c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:21 +0000 Subject: Docs/admin-guide/mm/damon/usage: move debugfs intro to the bottom of the section On the DAMON usage introduction section, the introduction of DAMON debugfs interface, which is deprecated, is above kernel API, which is actively supported. Move the DAMON debugfs intro to bottom, so that readers have less chances to read it. Link: https://lkml.kernel.org/r/20230907022929.91361-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index bffe9dd9b0d6..e48101c777e1 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -20,18 +20,18 @@ DAMON provides below interfaces for different users. you can write and use your personalized DAMON sysfs wrapper programs that reads/writes the sysfs files instead of you. The `DAMON user space tool `_ is one example of such programs. -- *debugfs interface. (DEPRECATED!)* - :ref:`This ` is almost identical to :ref:`sysfs interface - `. This is deprecated, so users should move to the - :ref:`sysfs interface `. If you depend on this and cannot - move, please report your usecase to damon@lists.linux.dev and - linux-mm@kvack.org. - *Kernel Space Programming Interface.* :doc:`This ` is for kernel space programmers. Using this, users can utilize every feature of DAMON most flexibly and efficiently by writing kernel space DAMON application programs for you. You can even extend DAMON for various address spaces. For detail, please refer to the interface :doc:`document `. +- *debugfs interface. (DEPRECATED!)* + :ref:`This ` is almost identical to :ref:`sysfs interface + `. This is deprecated, so users should move to the + :ref:`sysfs interface `. If you depend on this and cannot + move, please report your usecase to damon@lists.linux.dev and + linux-mm@kvack.org. .. _sysfs_interface: -- cgit v1.2.3 From 24df886f9f32087ba70d95a03a11c23db171a756 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:22 +0000 Subject: Docs/mm/damon/design: explicitly introduce ``nr_accesses`` The design document is explaining about the access tracking mechanism and the access rate counter (nr_accesses), but not directly mentions the name. Add a sentence for making it clear. Link: https://lkml.kernel.org/r/20230907022929.91361-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index a20383d01a95..5c465835a44f 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -163,9 +163,10 @@ assumption (pages in a region have the same access frequencies) is kept, only one page in the region is required to be checked. Thus, for each ``sampling interval``, DAMON randomly picks one page in each region, waits for one ``sampling interval``, checks whether the page is accessed meanwhile, and -increases the access frequency of the region if so. Therefore, the monitoring -overhead is controllable by setting the number of regions. DAMON allows users -to set the minimum and the maximum number of regions for the trade-off. +increases the access frequency counter of the region if so. The counter is +called ``nr_regions`` of the region. Therefore, the monitoring overhead is +controllable by setting the number of regions. DAMON allows users to set the +minimum and the maximum number of regions for the trade-off. This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. -- cgit v1.2.3 From 4f554ca15ad24022178b954e03d73502a98d6d44 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:23 +0000 Subject: Docs/admin-guide/mm/damon/usage: explain the format of damon_aggregate tracepoint The example of the section for damon_aggregated tracepoint is not explaining how the output looks like, and how it can be interpreted. Add it. Link: https://lkml.kernel.org/r/20230907022929.91361-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 14 ++++++++++++++ Documentation/mm/damon/design.rst | 4 ++++ 2 files changed, 18 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index e48101c777e1..758575d33ab6 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -513,6 +513,20 @@ results using tracepoint supporting tools like ``perf``. For example:: # kill 9 $(pidof perf) # echo off > monitor_on # perf script + kdamond.0 46568 [027] 79357.842179: damon:damon_aggregated: target_id=0 nr_regions=11 122509119488-135708762112: 0 864 + [...] + +Each line of the perf script output represents each monitoring region. The +first five fields are as usual other tracepoint outputs. The sixth field +(``target_id=X``) shows the ide of the monitoring target of the region. The +seventh field (``nr_regions=X``) shows the total number of monitoring regions +for the target. The eighth field (``X-Y:``) shows the start (``X``) and end +(``Y``) addresses of the region in bytes. The ninth field (``X``) shows the +``nr_accesses`` of the region (refer to +:ref:`design ` for more details of the +counter). Finally the tenth field (``X``) shows the ``age`` of the region +(refer to :ref:`design ` for more details of the +counter). .. _debugfs_interface: diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 5c465835a44f..51aab9b0621a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -154,6 +154,8 @@ The monitoring overhead of this mechanism will arbitrarily increase as the size of the target workload grows. +.. _damon_design_region_based_sampling: + Region Based Sampling ~~~~~~~~~~~~~~~~~~~~~ @@ -191,6 +193,8 @@ In this way, DAMON provides its best-effort quality and minimal overhead while keeping the bounds users set for their trade-off. +.. _damon_design_age_tracking: + Age Tracking ~~~~~~~~~~~~ -- cgit v1.2.3 From 86ae64cde2f7165a20e2824431cec8e820a0b689 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:24 +0000 Subject: Docs/mm/damon/design: add a section for kdamond and DAMON context The design document is not explaining about the concept of kdamond and the DAMON context, while usage document does. Those concept explanation should be in the design document, and usage document should link those. Add a section for those. Link: https://lkml.kernel.org/r/20230907022929.91361-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 51aab9b0621a..ee099d45fea8 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -476,3 +476,13 @@ modules for proactive reclamation and LRU lists manipulation are provided. For more detail, please read the usage documents for those (:doc:`/admin-guide/mm/damon/reclaim` and :doc:`/admin-guide/mm/damon/lru_sort`). + + +Execution Model and Data Structures +=================================== + +The monitoring-related information including the monitoring request +specification and DAMON-based operation schemes are stored in a data structure +called DAMON ``context``. DAMON executes each context with a kernel thread +called ``kdamond``. Multiple kdamonds could run in parallel, for different +types of monitoring. -- cgit v1.2.3 From 46158bf211bd557cb7849ff747a5a94d82cd4aa8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:25 +0000 Subject: Docs/admin-guide/mm/damon/usage: link design doc for details of kdamond and context The explanation of kdamond and context is duplicated in the design and the usage documents. Replace that in the usage with links to those in the design document. Link: https://lkml.kernel.org/r/20230907022929.91361-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 19 +++++++++---------- Documentation/mm/damon/design.rst | 2 ++ 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 758575d33ab6..282062b6f134 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -105,14 +105,12 @@ having the root permission could use this directory. kdamonds/ --------- -The monitoring-related information including request specifications and results -are called DAMON context. DAMON executes each context with a kernel thread -called kdamond, and multiple kdamonds could run in parallel. - Under the ``admin`` directory, one directory, ``kdamonds``, which has files for -controlling the kdamonds exist. In the beginning, this directory has only one -file, ``nr_kdamonds``. Writing a number (``N``) to the file creates the number -of child directories named ``0`` to ``N-1``. Each directory represents each +controlling the kdamonds (refer to +:ref:`design ` for more +details) exists. In the beginning, this directory has only one file, +``nr_kdamonds``. Writing a number (``N``) to the file creates the number of +child directories named ``0`` to ``N-1``. Each directory represents each kdamond. kdamonds// @@ -150,9 +148,10 @@ kdamonds//contexts/ In the beginning, this directory has only one file, ``nr_contexts``. Writing a number (``N``) to the file creates the number of child directories named as -``0`` to ``N-1``. Each directory represents each monitoring context. At the -moment, only one context per kdamond is supported, so only ``0`` or ``1`` can -be written to the file. +``0`` to ``N-1``. Each directory represents each monitoring context (refer to +:ref:`design ` for more +details). At the moment, only one context per kdamond is supported, so only +``0`` or ``1`` can be written to the file. .. _sysfs_contexts: diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index ee099d45fea8..18e9b42673f8 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -478,6 +478,8 @@ more detail, please read the usage documents for those :doc:`/admin-guide/mm/damon/lru_sort`). +.. _damon_design_execution_model_and_data_structures: + Execution Model and Data Structures =================================== -- cgit v1.2.3 From 27e68c4b0d5a945d975140315332ea2e7aa2c57b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:26 +0000 Subject: mm/damon/core: fix a comment about damon_set_attrs() call timings The comment on damon_set_attrs() says it should not be called while the kdamond is running, but now some DAMON modules like sysfs interface and DAMON_RECLAIM call it from after_aggregation() and/or after_wmarks_check() callbacks for online tuning. Update the comment. Link: https://lkml.kernel.org/r/20230907022929.91361-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index bcd2bd9d6c10..9160a0674aff 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -541,7 +541,11 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should not be called while the kdamond is running. + * This function should be called while the kdamond is not running, or an + * access check results aggregation is not ongoing (e.g., from + * &struct damon_callback->after_aggregation or + * &struct damon_callback->after_wmarks_check callbacks). + * * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. -- cgit v1.2.3 From d896073fc767ebb40c11a6a9de71c390757ac64b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:27 +0000 Subject: mm/damon/core: add more comments for nr_accesses The comment on struct damon_region about nr_accesses field looks not sufficient. Many people actually used to ask what nr_accesses mean. There is more detailed explanation of the mechanism on the comment for struct damon_attrs, but it is also ambiguous, as it doesn't specify the name of the counter for aggregating the access check results. Make those more detailed. Link: https://lkml.kernel.org/r/20230907022929.91361-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ae2664d1d5f1..266f92b34dd2 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -43,6 +43,10 @@ struct damon_addr_range { * @list: List head for siblings. * @age: Age of this region. * + * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be + * increased for every &damon_attrs->sample_interval if an access to the region + * during the last sampling interval is found. + * * @age is initially zero, increased for each aggregation interval, and reset * to zero again if the access frequency is significantly changed. If two * regions are merged into a new region, both @nr_accesses and @age of the new @@ -472,13 +476,14 @@ struct damon_callback { * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or - * not. It aggregates and keeps the access information (number of accesses to - * each region) for @aggr_interval time. DAMON also checks whether the target - * memory regions need update (e.g., by ``mmap()`` calls from the application, - * in case of virtual memory monitoring) and applies the changes for each - * @ops_update_interval. All time intervals are in micro-seconds. - * Please refer to &struct damon_operations and &struct damon_callback for more - * detail. + * not during the last @sample_interval. If such access is found, DAMON + * aggregates the information by increasing &damon_region->nr_accesses for + * @aggr_interval time. For each @aggr_interval, the count is reset. DAMON + * also checks whether the target memory regions need update (e.g., by + * ``mmap()`` calls from the application, in case of virtual memory monitoring) + * and applies the changes for each @ops_update_interval. All time intervals + * are in micro-seconds. Please refer to &struct damon_operations and &struct + * damon_callback for more detail. */ struct damon_attrs { unsigned long sample_interval; -- cgit v1.2.3 From cf0a96bd3ab4d9d8a1c92baf1a822f2ddbca3a34 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:28 +0000 Subject: mm/damon/core: remove duplicated comment for watermarks-based deactivation The comment for explaining about watermarks-based monitoring part deactivation is duplicated in two paragraphs. Remove one. Link: https://lkml.kernel.org/r/20230907022929.91361-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 266f92b34dd2..ab3089de1478 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -317,9 +317,6 @@ struct damos_access_pattern { * monitoring context are inactive, DAMON stops monitoring either, and just * repeatedly checks the watermarks. * - * If all schemes that registered to a &struct damon_ctx are inactive, DAMON - * stops monitoring and just repeatedly checks the watermarks. - * * Before applying the &action to a memory region, &struct damon_operations * implementation could check pages of the region and skip &action to respect * &filters -- cgit v1.2.3 From 2d00946bd7f4e8c17cbd2fce5fd7c3ab58046dff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 7 Sep 2023 02:29:29 +0000 Subject: mm/damon/core: remove 'struct target *' parameter from damon_aggregated tracepoint damon_aggregateed tracepoint is receiving 'struct target *', but doesn't use it. Remove it from the prototype. Link: https://lkml.kernel.org/r/20230907022929.91361-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 6 +++--- mm/damon/core.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index c79f1d4c39af..0b8d13bde17a 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, unsigned int target_id, - struct damon_region *r, unsigned int nr_regions), + TP_PROTO(unsigned int target_id, struct damon_region *r, + unsigned int nr_regions), - TP_ARGS(t, target_id, r, nr_regions), + TP_ARGS(target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) diff --git a/mm/damon/core.c b/mm/damon/core.c index 9160a0674aff..ca631dd88b33 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -776,7 +776,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); + trace_damon_aggregated(ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } -- cgit v1.2.3 From 7fa38d0ea00ffe2cd3c95c96c85221b8ae221875 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Wed, 6 Sep 2023 10:33:12 +0000 Subject: mm: remove duplicated vma->vm_flags check when expanding stack expand_upwards() and expand_downwards() will return -EFAULT if VM_GROWSUP or VM_GROWSDOWN is not correctly set in vma->vm_flags, however in !CONFIG_STACK_GROWSUP case, expand_stack_locked() returns -EINVAL first if !(vma->vm_flags & VM_GROWSDOWN) before calling expand_downwards(), to keep the consistency with CONFIG_STACK_GROWSUP case, remove this check. The usages of this function are as below: A:fs/exec.c ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; or B:mm/memory.c mm/mmap.c if (expand_stack_locked(vma, addr)) return NULL; which means the return value will not propagate to other places, so I believe there is no user-visible effects of this change, and it's unnecessary to backport to earlier versions. Link: https://lkml.kernel.org/r/20230906103312.645712-1-xiujianfeng@huaweicloud.com Fixes: f440fa1ac955 ("mm: make find_extend_vma() fail if write lock not held") Signed-off-by: Xiu Jianfeng Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1ec6c875a7f9..42870c4d8704 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2159,8 +2159,6 @@ struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned lon #else int expand_stack_locked(struct vm_area_struct *vma, unsigned long address) { - if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) - return -EINVAL; return expand_downwards(vma, address); } -- cgit v1.2.3 From 6a898c2757af1ac852bb917a0866d2724f303076 Mon Sep 17 00:00:00 2001 From: Yuan Can Date: Wed, 6 Sep 2023 17:31:57 +0800 Subject: mm: hugetlb_vmemmap: allow alloc vmemmap pages fallback to other nodes In vmemmap_remap_free(), a new head vmemmap page is allocated to avoid breaking a contiguous block of struct page memory, however, the allocation can always fail when the given node is movable node. Remove the __GFP_THISNODE to help avoid fragmentation. Link: https://lkml.kernel.org/r/20230906093157.9737-1-yuancan@huawei.com Signed-off-by: Yuan Can Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Suggested-by: Muchun Song Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 3e5387835ad6..0bde38626d25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -320,8 +320,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .vmemmap_pages = &vmemmap_pages, }; int nid = page_to_nid((struct page *)start); - gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY | - __GFP_NOWARN; + gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous -- cgit v1.2.3 From 40dca9b3d65a20c4b7a48252ece572132a299302 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Wed, 6 Sep 2023 17:11:13 +0800 Subject: mm/mm_init.c: remove redundant pr_info when node is memoryless There is a similar pr_info in free_area_init_node(), so remove the redundant pr_info. before: [ 0.006314] Initializing node 0 as memoryless [ 0.006445] Initmem setup node 0 as memoryless [ 0.006450] Initmem setup node 1 [mem 0x0000000000001000-0x000000003fffffff] [ 0.006453] Initmem setup node 2 [mem 0x0000000040000000-0x000000007ffd7fff] [ 0.006454] Initializing node 3 as memoryless [ 0.006584] Initmem setup node 3 as memoryless [ 0.006585] Initmem setup node 4 [mem 0x0000000100000000-0x00000001bfffffff] [ 0.006586] Initmem setup node 5 [mem 0x00000001c0000000-0x00000001ffffffff] [ 0.006587] Initmem setup node 6 [mem 0x0000000200000000-0x000000023fffffff] after: [ 0.004147] Initmem setup node 0 as memoryless [ 0.004148] Initmem setup node 1 [mem 0x0000000000001000-0x000000003fffffff] [ 0.004150] Initmem setup node 2 [mem 0x0000000040000000-0x000000007ffd7fff] [ 0.004154] Initmem setup node 3 as memoryless [ 0.004155] Initmem setup node 4 [mem 0x0000000100000000-0x00000001bfffffff] [ 0.004156] Initmem setup node 5 [mem 0x00000001c0000000-0x00000001ffffffff] [ 0.004157] Initmem setup node 6 [mem 0x0000000200000000-0x000000023fffffff] Link: https://lkml.kernel.org/r/20230906091113.4029983-1-yajun.deng@linux.dev Signed-off-by: Yajun Deng Reviewed-by: David Hildenbrand Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 50f2f34745af..6be6f50813b1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1871,8 +1871,6 @@ void __init free_area_init(unsigned long *max_zone_pfn) pg_data_t *pgdat; if (!node_online(nid)) { - pr_info("Initializing node %d as memoryless\n", nid); - /* Allocator not initialized yet */ pgdat = arch_alloc_nodedata(nid); if (!pgdat) -- cgit v1.2.3 From 037dd8f9024bdd5540e2b66eba56c279f740b23c Mon Sep 17 00:00:00 2001 From: Angus Chen Date: Wed, 6 Sep 2023 16:37:00 +0800 Subject: mm/vmscan: print err before panic If panic is enable,the err information will not be printed before bugon, So swap it. Print the return value of PTR_ERR(pgdat->kswapd) also. Link: https://lkml.kernel.org/r/20230906083700.181-1-angus.chen@jaguarmicro.com Signed-off-by: Angus Chen Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 93fdeadea896..4062bcca5311 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7892,8 +7892,9 @@ void __meminit kswapd_run(int nid) pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); if (IS_ERR(pgdat->kswapd)) { /* failure at boot is fatal */ + pr_err("Failed to start kswapd on node %d,ret=%ld\n", + nid, PTR_ERR(pgdat->kswapd)); BUG_ON(system_state < SYSTEM_RUNNING); - pr_err("Failed to start kswapd on node %d\n", nid); pgdat->kswapd = NULL; } } -- cgit v1.2.3 From 84e8e54e2ed9d857c4e10a7e487532203385249c Mon Sep 17 00:00:00 2001 From: Ying Sun Date: Wed, 6 Sep 2023 12:50:12 +0800 Subject: mm/shmem: remove dead code can not be satisfied by "(CONFIG_SHMEM)&&(!(CONFIG_SHMEM))" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The value of “.fs_flags” in line 4608 is a dead code which will never be implemented,because its conditions of line 47 "#ifdef CONFIG_SHMEM" and line 4607 are mutually exclusive. It is recommended to delete redundant code. Link: https://lkml.kernel.org/r/20230906045012.14999-1-sunying@nj.iscas.ac.cn Signed-off-by: Ying Sun Suggested-by: Yanjie Ren Signed-off-by: Andrew Morton --- mm/shmem.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 69595d341882..8539d39ec9a3 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4585,11 +4585,7 @@ static struct file_system_type shmem_fs_type = { .parameters = shmem_fs_parameters, #endif .kill_sb = kill_litter_super, -#ifdef CONFIG_SHMEM .fs_flags = FS_USERNS_MOUNT | FS_ALLOW_IDMAP, -#else - .fs_flags = FS_USERNS_MOUNT, -#endif }; void __init shmem_init(void) -- cgit v1.2.3 From 1717449b4417da2d24e2f9db95239e85f1317002 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 12 Sep 2023 01:17:20 +1000 Subject: memfd: drop warning for missing exec-related flags Commit 434ed3350f57 ("memfd: improve userspace warnings for missing exec-related flags") attempted to make these warnings more useful (so they would work as an incentive to get users to switch to specifying these flags -- as intended by the original MFD_NOEXEC_SEAL patchset). Unfortunately, it turns out that even INFO-level logging is too extreme to enable by default and alternative solutions to the spam issue (such as doing more extreme rate-limiting per-task) are either too ugly or overkill for something as simple as emitting a log as a developer aid. Given that the flags are new and there is no harm to not specifying them (after all, we maintain backwards compatibility) we can just drop the warnings for now until some time in the future when most programs have migrated and distributions start using vm.memfd_noexec=1 (where failing to pass the flag would result in unexpected errors for programs that use executable memfds). Link: https://lkml.kernel.org/r/20230912-memfd-reduce-spam-v2-1-7d92a4964b6a@cyphar.com Fixes: 434ed3350f57 ("memfd: improve userspace warnings for missing exec-related flags") Fixes: 2562d67b1bdf ("revert "memfd: improve userspace warnings for missing exec-related flags".") Signed-off-by: Aleksa Sarai Reported-by: Damian Tometzki Reviewed-by: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/memfd.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 2dba2cb6f0d0..d3a1ba4208c9 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -315,12 +315,6 @@ SYSCALL_DEFINE2(memfd_create, if ((flags & MFD_EXEC) && (flags & MFD_NOEXEC_SEAL)) return -EINVAL; - if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { - pr_warn_once( - "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n", - current->comm, task_pid_nr(current)); - } - error = check_sysctl_memfd_noexec(&flags); if (error < 0) return error; -- cgit v1.2.3 From be1ab60eb04f27e559d8be9655beb04d2bc90b25 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:56:59 +0200 Subject: kmsan: simplify kmsan_internal_memmove_metadata() kmsan_internal_memmove_metadata() is the function that implements copying metadata every time memcpy()/memmove() is called. Because shadow memory stores 1 byte per each byte of kernel memory, copying the shadow is trivial and can be done by a single memmove() call. Origins, on the other hand, are stored as 4-byte values corresponding to every aligned 4 bytes of kernel memory. Therefore, if either the source or the destination of kmsan_internal_memmove_metadata() is unaligned, the number of origin slots corresponding to the source or destination may differ: 1) memcpy(0xffff888080a00000, 0xffff888080900000, 4) copies 1 origin slot into 1 origin slot: src (0xffff888080900000): xxxx src origins: o111 dst (0xffff888080a00000): xxxx dst origins: o111 2) memcpy(0xffff888080a00001, 0xffff888080900000, 4) copies 1 origin slot into 2 origin slots: src (0xffff888080900000): xxxx src origins: o111 dst (0xffff888080a00000): .xxx x... dst origins: o111 o111 3) memcpy(0xffff888080a00000, 0xffff888080900001, 4) copies 2 origin slots into 1 origin slot: src (0xffff888080900000): .xxx x... src origins: o111 o222 dst (0xffff888080a00000): xxxx dst origins: o111 (or o222) Previously, kmsan_internal_memmove_metadata() tried to solve this problem by copying min(src_slots, dst_slots) as is and cloning the missing slot on one of the ends, if needed. This was error-prone even in the simple cases where 4 bytes were copied, and did not account for situations where the total number of nonzero origin slots could have increased by more than one after copying: memcpy(0xffff888080a00000, 0xffff888080900002, 8) src (0xffff888080900002): ..xx .... xx.. src origins: o111 0000 o222 dst (0xffff888080a00000): xx.. ..xx o111 0000 (or 0000 o222) The new implementation simply copies the shadow byte by byte, and updates the corresponding origin slot, if the shadow byte is nonzero. This approach can handle complex cases with mixed initialized and uninitialized bytes. Similarly to KMSAN inline instrumentation, latter writes to bytes sharing the same origin slots take precedence. Link: https://lkml.kernel.org/r/20230911145702.2663753-1-glider@google.com Fixes: f80be4571b19 ("kmsan: add KMSAN runtime core") Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/core.c | 127 ++++++++++++++------------------------------------------ 1 file changed, 31 insertions(+), 96 deletions(-) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 3adb4c1d3b19..c19f47af0424 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -83,131 +83,66 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, /* Copy the metadata following the memmove() behavior. */ void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) { + depot_stack_handle_t prev_old_origin = 0, prev_new_origin = 0; + int i, iter, step, src_off, dst_off, oiter_src, oiter_dst; depot_stack_handle_t old_origin = 0, new_origin = 0; - int src_slots, dst_slots, i, iter, step, skip_bits; depot_stack_handle_t *origin_src, *origin_dst; - void *shadow_src, *shadow_dst; - u32 *align_shadow_src, shadow; + u8 *shadow_src, *shadow_dst; + u32 *align_shadow_dst; bool backwards; shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); if (!shadow_dst) return; KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + align_shadow_dst = + (u32 *)ALIGN_DOWN((u64)shadow_dst, KMSAN_ORIGIN_SIZE); shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); if (!shadow_src) { - /* - * @src is untracked: zero out destination shadow, ignore the - * origins, we're done. - */ - __memset(shadow_dst, 0, n); + /* @src is untracked: mark @dst as initialized. */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); return; } KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); - __memmove(shadow_dst, shadow_src, n); - origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); KMSAN_WARN_ON(!origin_dst || !origin_src); - src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - - ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / - KMSAN_ORIGIN_SIZE; - dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - - ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / - KMSAN_ORIGIN_SIZE; - KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); - KMSAN_WARN_ON((src_slots - dst_slots > 1) || - (dst_slots - src_slots < -1)); backwards = dst > src; - i = backwards ? min(src_slots, dst_slots) - 1 : 0; - iter = backwards ? -1 : 1; - - align_shadow_src = - (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); - for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { - KMSAN_WARN_ON(i < 0); - shadow = align_shadow_src[i]; - if (i == 0) { - /* - * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't - * look at the first @src % KMSAN_ORIGIN_SIZE bytes - * of the first shadow slot. - */ - skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow >> skip_bits) << skip_bits; + step = backwards ? -1 : 1; + iter = backwards ? n - 1 : 0; + src_off = (u64)src % KMSAN_ORIGIN_SIZE; + dst_off = (u64)dst % KMSAN_ORIGIN_SIZE; + + /* Copy shadow bytes one by one, updating the origins if necessary. */ + for (i = 0; i < n; i++, iter += step) { + oiter_src = (iter + src_off) / KMSAN_ORIGIN_SIZE; + oiter_dst = (iter + dst_off) / KMSAN_ORIGIN_SIZE; + if (!shadow_src[iter]) { + shadow_dst[iter] = 0; + if (!align_shadow_dst[oiter_dst]) + origin_dst[oiter_dst] = 0; + continue; } - if (i == src_slots - 1) { - /* - * If @src + n isn't aligned on - * KMSAN_ORIGIN_SIZE, don't look at the last - * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the - * last shadow slot. - */ - skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow << skip_bits) >> skip_bits; - } - /* - * Overwrite the origin only if the corresponding - * shadow is nonempty. - */ - if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { - old_origin = origin_src[i]; - new_origin = kmsan_internal_chain_origin(old_origin); + shadow_dst[iter] = shadow_src[iter]; + old_origin = origin_src[oiter_src]; + if (old_origin == prev_old_origin) + new_origin = prev_new_origin; + else { /* * kmsan_internal_chain_origin() may return * NULL, but we don't want to lose the previous * origin value. */ + new_origin = kmsan_internal_chain_origin(old_origin); if (!new_origin) new_origin = old_origin; } - if (shadow) - origin_dst[i] = new_origin; - else - origin_dst[i] = 0; - } - /* - * If dst_slots is greater than src_slots (i.e. - * dst_slots == src_slots + 1), there is an extra origin slot at the - * beginning or end of the destination buffer, for which we take the - * origin from the previous slot. - * This is only done if the part of the source shadow corresponding to - * slot is non-zero. - * - * E.g. if we copy 8 aligned bytes that are marked as uninitialized - * and have origins o111 and o222, to an unaligned buffer with offset 1, - * these two origins are copied to three origin slots, so one of then - * needs to be duplicated, depending on the copy direction (@backwards) - * - * src shadow: |uuuu|uuuu|....| - * src origin: |o111|o222|....| - * - * backwards = 0: - * dst shadow: |.uuu|uuuu|u...| - * dst origin: |....|o111|o222| - fill the empty slot with o111 - * backwards = 1: - * dst shadow: |.uuu|uuuu|u...| - * dst origin: |o111|o222|....| - fill the empty slot with o222 - */ - if (src_slots < dst_slots) { - if (backwards) { - shadow = align_shadow_src[src_slots - 1]; - skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow << skip_bits) >> skip_bits; - if (shadow) - /* src_slots > 0, therefore dst_slots is at least 2 */ - origin_dst[dst_slots - 1] = - origin_dst[dst_slots - 2]; - } else { - shadow = align_shadow_src[0]; - skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; - shadow = (shadow >> skip_bits) << skip_bits; - if (shadow) - origin_dst[0] = origin_dst[1]; - } + origin_dst[oiter_dst] = new_origin; + prev_new_origin = new_origin; + prev_old_origin = old_origin; } } -- cgit v1.2.3 From 0be7b2c232cfb1621660c78b0fd7260361c75e27 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:00 +0200 Subject: kmsan: prevent optimizations in memcpy tests Clang 18 learned to optimize away memcpy() calls of small uninitialized scalar values. To ensure that memcpy tests in kmsan_test.c still perform calls to memcpy() (which KMSAN replaces with __msan_memcpy()), declare a separate memcpy_noinline() function with volatile parameters, which won't be optimized. Also retire DO_NOT_OPTIMIZE(), as memcpy_noinline() is apparently enough. Link: https://lkml.kernel.org/r/20230911145702.2663753-2-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 312989aa2865..a8d4ca4a1066 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -407,33 +407,25 @@ static void test_printk(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } -/* - * Prevent the compiler from optimizing @var away. Without this, Clang may - * notice that @var is uninitialized and drop memcpy() calls that use it. - * - * There is OPTIMIZER_HIDE_VAR() in linux/compier.h that we cannot use here, - * because it is implemented as inline assembly receiving @var as a parameter - * and will enforce a KMSAN check. Same is true for e.g. barrier_data(var). - */ -#define DO_NOT_OPTIMIZE(var) barrier() +/* Prevent the compiler from inlining a memcpy() call. */ +static noinline void *memcpy_noinline(volatile void *dst, + const volatile void *src, size_t size) +{ + return memcpy((void *)dst, (const void *)src, size); +} -/* - * Test case: ensure that memcpy() correctly copies initialized values. - * Also serves as a regression test to ensure DO_NOT_OPTIMIZE() does not cause - * extra checks. - */ +/* Test case: ensure that memcpy() correctly copies initialized values. */ static void test_init_memcpy(struct kunit *test) { EXPECTATION_NO_REPORT(expect); - volatile int src; - volatile int dst = 0; + volatile long long src; + volatile long long dst = 0; - DO_NOT_OPTIMIZE(src); src = 1; kunit_info( test, "memcpy()ing aligned initialized src to aligned dst (no reports)\n"); - memcpy((void *)&dst, (void *)&src, sizeof(src)); + memcpy_noinline((void *)&dst, (void *)&src, sizeof(src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -451,8 +443,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -474,8 +465,9 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst[1], (void *)&uninit_src, + sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -498,8 +490,8 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); - DO_NOT_OPTIMIZE(uninit_src); - memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + memcpy_noinline((void *)&dst[1], (void *)&uninit_src, + sizeof(uninit_src)); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -513,7 +505,6 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) \ kunit_info(test, \ "memset" #size "() should initialize memory\n"); \ - DO_NOT_OPTIMIZE(uninit); \ memset##size((uint##size##_t *)&uninit, 0, 1); \ kmsan_check_memory((void *)&uninit, sizeof(uninit)); \ KUNIT_EXPECT_TRUE(test, report_matches(&expect)); \ -- cgit v1.2.3 From c3ab4873c8d3620493042123471e65b5e2c37b2c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:01 +0200 Subject: kmsan: merge test_memcpy_aligned_to_unaligned{,2}() together Introduce report_reset() that allows checking for more than one KMSAN report per testcase. Fold test_memcpy_aligned_to_unaligned2() into test_memcpy_aligned_to_unaligned(), so that they share the setup phase and check the behavior of a single memcpy() call. Link: https://lkml.kernel.org/r/20230911145702.2663753-3-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index a8d4ca4a1066..6eb1e1a4d08f 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -67,6 +67,17 @@ static bool report_available(void) return READ_ONCE(observed.available); } +/* Reset observed.available, so that the test can trigger another report. */ +static void report_reset(void) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + WRITE_ONCE(observed.available, false); + observed.ignore = false; + spin_unlock_irqrestore(&observed.lock, flags); +} + /* Information we expect in a report. */ struct expect_report { const char *error_type; /* Error type. */ @@ -454,7 +465,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) * * Copying aligned 4-byte value to an unaligned one leads to touching two * aligned 4-byte values. This test case checks that KMSAN correctly reports an - * error on the first of the two values. + * error on the mentioned two values. */ static void test_memcpy_aligned_to_unaligned(struct kunit *test) { @@ -470,28 +481,7 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); -} - -/* - * Test case: ensure that memcpy() correctly copies uninitialized values between - * aligned `src` and unaligned `dst`. - * - * Copying aligned 4-byte value to an unaligned one leads to touching two - * aligned 4-byte values. This test case checks that KMSAN correctly reports an - * error on the second of the two values. - */ -static void test_memcpy_aligned_to_unaligned2(struct kunit *test) -{ - EXPECTATION_UNINIT_VALUE_FN(expect, - "test_memcpy_aligned_to_unaligned2"); - volatile int uninit_src; - volatile char dst[8] = { 0 }; - - kunit_info( - test, - "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); - memcpy_noinline((void *)&dst[1], (void *)&uninit_src, - sizeof(uninit_src)); + report_reset(); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } @@ -589,7 +579,6 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_init_memcpy), KUNIT_CASE(test_memcpy_aligned_to_aligned), KUNIT_CASE(test_memcpy_aligned_to_unaligned), - KUNIT_CASE(test_memcpy_aligned_to_unaligned2), KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), -- cgit v1.2.3 From 46fa84a2b920f91780db3347de81d65efb668301 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 11 Sep 2023 16:57:02 +0200 Subject: kmsan: introduce test_memcpy_initialized_gap() Add a regression test for the special case where memcpy() previously failed to correctly set the origins: if upon memcpy() four aligned initialized bytes with a zero origin value ended up split between two aligned four-byte chunks, one of those chunks could've received the zero origin value even despite it contained uninitialized bytes from other writes. Link: https://lkml.kernel.org/r/20230911145702.2663753-4-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Marco Elver Acked-by: Marco Elver Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kmsan/kmsan_test.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 6eb1e1a4d08f..07d3a3a5a9c5 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -486,6 +486,58 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +/* + * Test case: ensure that origin slots do not accidentally get overwritten with + * zeroes during memcpy(). + * + * Previously, when copying memory from an aligned buffer to an unaligned one, + * if there were zero origins corresponding to zero shadow values in the source + * buffer, they could have ended up being copied to nonzero shadow values in the + * destination buffer: + * + * memcpy(0xffff888080a00000, 0xffff888080900002, 8) + * + * src (0xffff888080900002): ..xx .... xx.. + * src origins: o111 0000 o222 + * dst (0xffff888080a00000): xx.. ..xx + * dst origins: o111 0000 + * (or 0000 o222) + * + * (here . stands for an initialized byte, and x for an uninitialized one. + * + * Ensure that this does not happen anymore, and for both destination bytes + * the origin is nonzero (i.e. KMSAN reports an error). + */ +static void test_memcpy_initialized_gap(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_initialized_gap"); + volatile char uninit_src[12]; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "unaligned 4-byte initialized value gets a nonzero origin after memcpy() - (2 UMR reports)\n"); + + uninit_src[0] = 42; + uninit_src[1] = 42; + uninit_src[4] = 42; + uninit_src[5] = 42; + uninit_src[6] = 42; + uninit_src[7] = 42; + uninit_src[10] = 42; + uninit_src[11] = 42; + memcpy_noinline((void *)&dst[0], (void *)&uninit_src[2], 8); + + kmsan_check_memory((void *)&dst[0], 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); + report_reset(); + kmsan_check_memory((void *)&dst[2], 4); + KUNIT_EXPECT_FALSE(test, report_matches(&expect)); + report_reset(); + kmsan_check_memory((void *)&dst[4], 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + /* Generate test cases for memset16(), memset32(), memset64(). */ #define DEFINE_TEST_MEMSETXX(size) \ static void test_memset##size(struct kunit *test) \ @@ -579,6 +631,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_init_memcpy), KUNIT_CASE(test_memcpy_aligned_to_aligned), KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_initialized_gap), KUNIT_CASE(test_memset16), KUNIT_CASE(test_memset32), KUNIT_CASE(test_memset64), -- cgit v1.2.3 From 7cd34dd3c9bf1d67ce7d1ab3fe8886c583ae0d9a Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Sep 2023 14:21:13 +0300 Subject: efi/unaccepted: do not let /proc/vmcore try to access unaccepted memory Patch series "Do not try to access unaccepted memory", v2. Support for unaccepted memory was added recently, refer commit dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual machine may need to accept memory before it can be used. Plug a few gaps where RAM is exposed without checking if it is unaccepted memory. This patch (of 2): Support for unaccepted memory was added recently, refer commit dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual machine may need to accept memory before it can be used. Do not let /proc/vmcore try to access unaccepted memory because it can cause the guest to fail. For /proc/vmcore, which is read-only, this means a read or mmap of unaccepted memory will return zeros. Link: https://lkml.kernel.org/r/20230911112114.91323-1-adrian.hunter@intel.com Link: https://lkml.kernel.org/r/20230911112114.91323-2-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dave Young Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Tom Lendacky Cc: Vivek Goyal Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/firmware/efi/unaccepted_memory.c | 20 ++++++++++++++++++++ include/linux/mm.h | 7 +++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/firmware/efi/unaccepted_memory.c b/drivers/firmware/efi/unaccepted_memory.c index 853f7dc3c21d..79ba576b22e3 100644 --- a/drivers/firmware/efi/unaccepted_memory.c +++ b/drivers/firmware/efi/unaccepted_memory.c @@ -3,6 +3,7 @@ #include #include #include +#include #include /* Protects unaccepted memory bitmap */ @@ -145,3 +146,22 @@ bool range_contains_unaccepted_memory(phys_addr_t start, phys_addr_t end) return ret; } + +#ifdef CONFIG_PROC_VMCORE +static bool unaccepted_memory_vmcore_pfn_is_ram(struct vmcore_cb *cb, + unsigned long pfn) +{ + return !pfn_is_unaccepted_memory(pfn); +} + +static struct vmcore_cb vmcore_cb = { + .pfn_is_ram = unaccepted_memory_vmcore_pfn_is_ram, +}; + +static int __init unaccepted_memory_init_kdump(void) +{ + register_vmcore_cb(&vmcore_cb); + return 0; +} +core_initcall(unaccepted_memory_init_kdump); +#endif /* CONFIG_PROC_VMCORE */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 21c3d4e8a282..31dc25d3f6b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4054,4 +4054,11 @@ static inline void accept_memory(phys_addr_t start, phys_addr_t end) #endif +static inline bool pfn_is_unaccepted_memory(unsigned long pfn) +{ + phys_addr_t paddr = pfn << PAGE_SHIFT; + + return range_contains_unaccepted_memory(paddr, paddr + PAGE_SIZE); +} + #endif /* _LINUX_MM_H */ -- cgit v1.2.3 From e538a582097878c536c68002c79722e4a037c080 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Sep 2023 14:21:14 +0300 Subject: proc/kcore: do not try to access unaccepted memory Support for unaccepted memory was added recently, refer commit dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual machine may need to accept memory before it can be used. Do not try to access unaccepted memory because it can cause the guest to fail. For /proc/kcore, which is read-only and does not support mmap, this means a read of unaccepted memory will return zeros. Link: https://lkml.kernel.org/r/20230911112114.91323-3-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Reviewed-by: David Hildenbrand Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dave Young Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Tom Lendacky Cc: Vivek Goyal Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 23fc24d16b31..6422e569b080 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * and explicitly excluded physical ranges. */ if (!page || PageOffline(page) || - is_page_hwpoison(page) || !pfn_is_ram(pfn)) { + is_page_hwpoison(page) || !pfn_is_ram(pfn) || + pfn_is_unaccepted_memory(pfn)) { if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; -- cgit v1.2.3 From 3ee0aa9f06756e959633ffda37856c6741d948ed Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:14 +0800 Subject: mm: move some shrinker-related function declarations to mm/internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "cleanups for lockless slab shrink", v4. This series is some cleanups for lockless slab shrink. This patch (of 4): The following functions are only used inside the mm subsystem, so it's better to move their declarations to the mm/internal.h file. 1. shrinker_debugfs_add() 2. shrinker_debugfs_detach() 3. shrinker_debugfs_remove() Link: https://lkml.kernel.org/r/20230911092517.64141-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911092517.64141-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Daniel Vetter Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 19 ------------------- mm/internal.h | 26 ++++++++++++++++++++++++++ mm/shrinker_debug.c | 2 ++ 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 224293b2dd06..8dc15aa37410 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -106,28 +106,9 @@ extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG -extern int shrinker_debugfs_add(struct shrinker *shrinker); -extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, - int *debugfs_id); -extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, - int debugfs_id); extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); #else /* CONFIG_SHRINKER_DEBUG */ -static inline int shrinker_debugfs_add(struct shrinker *shrinker) -{ - return 0; -} -static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, - int *debugfs_id) -{ - *debugfs_id = -1; - return NULL; -} -static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, - int debugfs_id) -{ -} static inline __printf(2, 3) int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) { diff --git a/mm/internal.h b/mm/internal.h index 30cf724ddbce..939d1227a527 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1154,4 +1154,30 @@ struct vma_prepare { struct vm_area_struct *remove; struct vm_area_struct *remove2; }; + +/* shrinker related functions */ + +#ifdef CONFIG_SHRINKER_DEBUG +extern int shrinker_debugfs_add(struct shrinker *shrinker); +extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id); +extern void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id); +#else /* CONFIG_SHRINKER_DEBUG */ +static inline int shrinker_debugfs_add(struct shrinker *shrinker) +{ + return 0; +} +static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, + int *debugfs_id) +{ + *debugfs_id = -1; + return NULL; +} +static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry, + int debugfs_id) +{ +} +#endif /* CONFIG_SHRINKER_DEBUG */ + #endif /* __MM_INTERNAL_H */ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 3ab53fad8876..ee0cddb4530f 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -6,6 +6,8 @@ #include #include +#include "internal.h" + /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -- cgit v1.2.3 From 96f7b2b9bbb1d680fab1fa96a3958b2d759b881f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:15 +0800 Subject: mm: vmscan: move shrinker-related code into a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mm/vmscan.c file is too large, so separate the shrinker-related code from it into a separate file. No functional changes. Link: https://lkml.kernel.org/r/20230911092517.64141-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/Makefile | 4 +- mm/internal.h | 2 + mm/shrinker.c | 709 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 701 --------------------------------------------------------- 4 files changed, 713 insertions(+), 703 deletions(-) create mode 100644 mm/shrinker.c diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..33873c8aedb3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -48,8 +48,8 @@ endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page-writeback.o folio-compat.o \ - readahead.o swap.o truncate.o vmscan.o shmem.o \ - util.o mmzone.o vmstat.o backing-dev.o \ + readahead.o swap.o truncate.o vmscan.o shrinker.o \ + shmem.o util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ compaction.o show_mem.o shmem_quota.o\ interval_tree.o list_lru.o workingset.o \ diff --git a/mm/internal.h b/mm/internal.h index 939d1227a527..0471d6326d01 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1156,6 +1156,8 @@ struct vma_prepare { }; /* shrinker related functions */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); diff --git a/mm/shrinker.c b/mm/shrinker.c new file mode 100644 index 000000000000..043c87ccfab4 --- /dev/null +++ b/mm/shrinker.c @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#include "internal.h" + +LIST_HEAD(shrinker_list); +DECLARE_RWSEM(shrinker_rwsem); + +#ifdef CONFIG_MEMCG +static int shrinker_nr_max; + +/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ +static inline int shrinker_map_size(int nr_items) +{ + return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); +} + +static inline int shrinker_defer_size(int nr_items) +{ + return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); +} + +void free_shrinker_info(struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *pn; + struct shrinker_info *info; + int nid; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + info = rcu_dereference_protected(pn->shrinker_info, true); + kvfree(info); + rcu_assign_pointer(pn->shrinker_info, NULL); + } +} + +int alloc_shrinker_info(struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + int nid, size, ret = 0; + int map_size, defer_size = 0; + + down_write(&shrinker_rwsem); + map_size = shrinker_map_size(shrinker_nr_max); + defer_size = shrinker_defer_size(shrinker_nr_max); + size = map_size + defer_size; + for_each_node(nid) { + info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); + if (!info) { + free_shrinker_info(memcg); + ret = -ENOMEM; + break; + } + info->nr_deferred = (atomic_long_t *)(info + 1); + info->map = (void *)info->nr_deferred + defer_size; + info->map_nr_max = shrinker_nr_max; + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); + } + up_write(&shrinker_rwsem); + + return ret; +} + +static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, + int nid) +{ + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); +} + +static int expand_one_shrinker_info(struct mem_cgroup *memcg, + int map_size, int defer_size, + int old_map_size, int old_defer_size, + int new_nr_max) +{ + struct shrinker_info *new, *old; + struct mem_cgroup_per_node *pn; + int nid; + int size = map_size + defer_size; + + for_each_node(nid) { + pn = memcg->nodeinfo[nid]; + old = shrinker_info_protected(memcg, nid); + /* Not yet online memcg */ + if (!old) + return 0; + + /* Already expanded this shrinker_info */ + if (new_nr_max <= old->map_nr_max) + continue; + + new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + if (!new) + return -ENOMEM; + + new->nr_deferred = (atomic_long_t *)(new + 1); + new->map = (void *)new->nr_deferred + defer_size; + new->map_nr_max = new_nr_max; + + /* map: set all old bits, clear all new bits */ + memset(new->map, (int)0xff, old_map_size); + memset((void *)new->map + old_map_size, 0, map_size - old_map_size); + /* nr_deferred: copy old values, clear all new values */ + memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); + memset((void *)new->nr_deferred + old_defer_size, 0, + defer_size - old_defer_size); + + rcu_assign_pointer(pn->shrinker_info, new); + kvfree_rcu(old, rcu); + } + + return 0; +} + +static int expand_shrinker_info(int new_id) +{ + int ret = 0; + int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); + int map_size, defer_size = 0; + int old_map_size, old_defer_size = 0; + struct mem_cgroup *memcg; + + if (!root_mem_cgroup) + goto out; + + lockdep_assert_held(&shrinker_rwsem); + + map_size = shrinker_map_size(new_nr_max); + defer_size = shrinker_defer_size(new_nr_max); + old_map_size = shrinker_map_size(shrinker_nr_max); + old_defer_size = shrinker_defer_size(shrinker_nr_max); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + ret = expand_one_shrinker_info(memcg, map_size, defer_size, + old_map_size, old_defer_size, + new_nr_max); + if (ret) { + mem_cgroup_iter_break(NULL, memcg); + goto out; + } + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); +out: + if (!ret) + shrinker_nr_max = new_nr_max; + + return ret; +} + +void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) +{ + if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { + struct shrinker_info *info; + + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { + /* Pairs with smp mb in shrink_slab() */ + smp_mb__before_atomic(); + set_bit(shrinker_id, info->map); + } + rcu_read_unlock(); + } +} + +static DEFINE_IDR(shrinker_idr); + +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + int id, ret = -ENOMEM; + + if (mem_cgroup_disabled()) + return -ENOSYS; + + down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ + id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); + if (id < 0) + goto unlock; + + if (id >= shrinker_nr_max) { + if (expand_shrinker_info(id)) { + idr_remove(&shrinker_idr, id); + goto unlock; + } + } + shrinker->id = id; + ret = 0; +unlock: + up_write(&shrinker_rwsem); + return ret; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ + int id = shrinker->id; + + BUG_ON(id < 0); + + lockdep_assert_held(&shrinker_rwsem); + + idr_remove(&shrinker_idr, id); +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + struct shrinker_info *info; + + info = shrinker_info_protected(memcg, nid); + return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); +} + +void reparent_shrinker_deferred(struct mem_cgroup *memcg) +{ + int i, nid; + long nr; + struct mem_cgroup *parent; + struct shrinker_info *child_info, *parent_info; + + parent = parent_mem_cgroup(memcg); + if (!parent) + parent = root_mem_cgroup; + + /* Prevent from concurrent shrinker_info expand */ + down_read(&shrinker_rwsem); + for_each_node(nid) { + child_info = shrinker_info_protected(memcg, nid); + parent_info = shrinker_info_protected(parent, nid); + for (i = 0; i < child_info->map_nr_max; i++) { + nr = atomic_long_read(&child_info->nr_deferred[i]); + atomic_long_add(nr, &parent_info->nr_deferred[i]); + } + } + up_read(&shrinker_rwsem); +} +#else +static int prealloc_memcg_shrinker(struct shrinker *shrinker) +{ + return -ENOSYS; +} + +static void unregister_memcg_shrinker(struct shrinker *shrinker) +{ +} + +static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} + +static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, + struct mem_cgroup *memcg) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +static long xchg_nr_deferred(struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return xchg_nr_deferred_memcg(nid, shrinker, + sc->memcg); + + return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); +} + + +static long add_nr_deferred(long nr, struct shrinker *shrinker, + struct shrink_control *sc) +{ + int nid = sc->nid; + + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + nid = 0; + + if (sc->memcg && + (shrinker->flags & SHRINKER_MEMCG_AWARE)) + return add_nr_deferred_memcg(nr, nid, shrinker, + sc->memcg); + + return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); +} + +#define SHRINK_BATCH 128 + +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, int priority) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0 || freeable == SHRINK_EMPTY) + return freeable; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = xchg_nr_deferred(shrinker, shrinkctl); + + if (shrinker->seeks) { + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); + } else { + /* + * These objects don't require any IO to create. Trim + * them aggressively under memory pressure to keep + * them from causing refetches in the IO caches. + */ + delta = freeable / 2; + } + + total_scan = nr >> priority; + total_scan += delta; + total_scan = min(total_scan, (2 * freeable)); + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + freeable, delta, total_scan, priority); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + shrinkctl->nr_scanned = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); + total_scan -= shrinkctl->nr_scanned; + scanned += shrinkctl->nr_scanned; + + cond_resched(); + } + + /* + * The deferred work is increased by any new work (delta) that wasn't + * done, decreased by old deferred work that was done now. + * + * And it is capped to two times of the freeable items. + */ + next_deferred = max_t(long, (nr + delta - scanned), 0); + next_deferred = min(next_deferred, (2 * freeable)); + + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. + */ + new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); + + trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); + return freed; +} + +#ifdef CONFIG_MEMCG +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + struct shrinker_info *info; + unsigned long ret, freed = 0; + int i; + + if (!mem_cgroup_online(memcg)) + return 0; + + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); + if (unlikely(!info)) + goto unlock; + + for_each_set_bit(i, info->map, info->map_nr_max) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + + shrinker = idr_find(&shrinker_idr, i); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(i, info->map); + continue; + } + + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(i, info->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, i); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } +unlock: + up_read(&shrinker_rwsem); + return freed; +} +#else /* !CONFIG_MEMCG */ +static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, int priority) +{ + return 0; +} +#endif /* CONFIG_MEMCG */ + +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @priority: the reclaim priority + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. Unaware shrinkers + * are called only if it is the root cgroup. + * + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. + * + * Returns the number of reclaimed slab objects. + */ +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority) +{ + unsigned long ret, freed = 0; + struct shrinker *shrinker; + + /* + * The root memcg might be allocated even though memcg is disabled + * via "cgroup_disable=memory" boot parameter. This could make + * mem_cgroup_is_root() return false, then just run memcg slab + * shrink, but skip global shrink. This may result in premature + * oom. + */ + if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) + return shrink_slab_memcg(gfp_mask, nid, memcg, priority); + + if (!down_read_trylock(&shrinker_rwsem)) + goto out; + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } + } + + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} + +/* + * Add a shrinker callback to be called from the vm. + */ +static int __prealloc_shrinker(struct shrinker *shrinker) +{ + unsigned int size; + int err; + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err != -ENOSYS) + return err; + + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + } + + size = sizeof(*shrinker->nr_deferred); + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __prealloc_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + + return err; +} +#else +int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __prealloc_shrinker(shrinker); +} +#endif + +void free_prealloced_shrinker(struct shrinker *shrinker) +{ +#ifdef CONFIG_SHRINKER_DEBUG + kfree_const(shrinker->name); + shrinker->name = NULL; +#endif + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { + down_write(&shrinker_rwsem); + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + return; + } + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} + +void register_shrinker_prepared(struct shrinker *shrinker) +{ + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} + +static int __register_shrinker(struct shrinker *shrinker) +{ + int err = __prealloc_shrinker(shrinker); + + if (err) + return err; + register_shrinker_prepared(shrinker); + return 0; +} + +#ifdef CONFIG_SHRINKER_DEBUG +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + va_list ap; + int err; + + va_start(ap, fmt); + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + va_end(ap); + if (!shrinker->name) + return -ENOMEM; + + err = __register_shrinker(shrinker); + if (err) { + kfree_const(shrinker->name); + shrinker->name = NULL; + } + return err; +} +#else +int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) +{ + return __register_shrinker(shrinker); +} +#endif +EXPORT_SYMBOL(register_shrinker); + +/* + * Remove one + */ +void unregister_shrinker(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry; + int debugfs_id; + + if (!(shrinker->flags & SHRINKER_REGISTERED)) + return; + + down_write(&shrinker_rwsem); + list_del(&shrinker->list); + shrinker->flags &= ~SHRINKER_REGISTERED; + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + up_write(&shrinker_rwsem); + + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; +} +EXPORT_SYMBOL(unregister_shrinker); + +/** + * synchronize_shrinkers - Wait for all running shrinkers to complete. + * + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. + */ +void synchronize_shrinkers(void) +{ + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(synchronize_shrinkers); diff --git a/mm/vmscan.c b/mm/vmscan.c index 4062bcca5311..4a6551510b65 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include @@ -188,246 +187,7 @@ struct scan_control { */ int vm_swappiness = 60; -LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); - #ifdef CONFIG_MEMCG -static int shrinker_nr_max; - -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) -{ - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); -} - -static inline int shrinker_defer_size(int nr_items) -{ - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); -} - -static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, - int nid) -{ - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); -} - -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) -{ - struct shrinker_info *new, *old; - struct mem_cgroup_per_node *pn; - int nid; - int size = map_size + defer_size; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - old = shrinker_info_protected(memcg, nid); - /* Not yet online memcg */ - if (!old) - return 0; - - /* Already expanded this shrinker_info */ - if (new_nr_max <= old->map_nr_max) - continue; - - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); - if (!new) - return -ENOMEM; - - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; - new->map_nr_max = new_nr_max; - - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); - - rcu_assign_pointer(pn->shrinker_info, new); - kvfree_rcu(old, rcu); - } - - return 0; -} - -void free_shrinker_info(struct mem_cgroup *memcg) -{ - struct mem_cgroup_per_node *pn; - struct shrinker_info *info; - int nid; - - for_each_node(nid) { - pn = memcg->nodeinfo[nid]; - info = rcu_dereference_protected(pn->shrinker_info, true); - kvfree(info); - rcu_assign_pointer(pn->shrinker_info, NULL); - } -} - -int alloc_shrinker_info(struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; - - down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; - for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; - info->map_nr_max = shrinker_nr_max; - rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); - } - up_write(&shrinker_rwsem); - - return ret; -} - -static int expand_shrinker_info(int new_id) -{ - int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; - struct mem_cgroup *memcg; - - if (!root_mem_cgroup) - goto out; - - lockdep_assert_held(&shrinker_rwsem); - - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); - - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, - new_nr_max); - if (ret) { - mem_cgroup_iter_break(NULL, memcg); - goto out; - } - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); -out: - if (!ret) - shrinker_nr_max = new_nr_max; - - return ret; -} - -void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) -{ - if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { - struct shrinker_info *info; - - rcu_read_lock(); - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); - if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { - /* Pairs with smp mb in shrink_slab() */ - smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); - } - rcu_read_unlock(); - } -} - -static DEFINE_IDR(shrinker_idr); - -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - int id, ret = -ENOMEM; - - if (mem_cgroup_disabled()) - return -ENOSYS; - - down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ - id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); - if (id < 0) - goto unlock; - - if (id >= shrinker_nr_max) { - if (expand_shrinker_info(id)) { - idr_remove(&shrinker_idr, id); - goto unlock; - } - } - shrinker->id = id; - ret = 0; -unlock: - up_write(&shrinker_rwsem); - return ret; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ - int id = shrinker->id; - - BUG_ON(id < 0); - - lockdep_assert_held(&shrinker_rwsem); - - idr_remove(&shrinker_idr, id); -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - struct shrinker_info *info; - - info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); -} - -void reparent_shrinker_deferred(struct mem_cgroup *memcg) -{ - int i, nid; - long nr; - struct mem_cgroup *parent; - struct shrinker_info *child_info, *parent_info; - - parent = parent_mem_cgroup(memcg); - if (!parent) - parent = root_mem_cgroup; - - /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); - for_each_node(nid) { - child_info = shrinker_info_protected(memcg, nid); - parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); - } - } - up_read(&shrinker_rwsem); -} /* Returns true for reclaim through cgroup limits or cgroup interfaces. */ static bool cgroup_reclaim(struct scan_control *sc) @@ -468,27 +228,6 @@ static bool writeback_throttling_sane(struct scan_control *sc) return false; } #else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) -{ - return -ENOSYS; -} - -static void unregister_memcg_shrinker(struct shrinker *shrinker) -{ -} - -static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - -static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, - struct mem_cgroup *memcg) -{ - return 0; -} - static bool cgroup_reclaim(struct scan_control *sc) { return false; @@ -557,39 +296,6 @@ static void flush_reclaim_state(struct scan_control *sc) } } -static long xchg_nr_deferred(struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return xchg_nr_deferred_memcg(nid, shrinker, - sc->memcg); - - return atomic_long_xchg(&shrinker->nr_deferred[nid], 0); -} - - -static long add_nr_deferred(long nr, struct shrinker *shrinker, - struct shrink_control *sc) -{ - int nid = sc->nid; - - if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) - nid = 0; - - if (sc->memcg && - (shrinker->flags & SHRINKER_MEMCG_AWARE)) - return add_nr_deferred_memcg(nr, nid, shrinker, - sc->memcg); - - return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]); -} - static bool can_demote(int nid, struct scan_control *sc) { if (!numa_demotion_enabled) @@ -671,413 +377,6 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, return size; } -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); - -#define SHRINK_BATCH 128 - -static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, int priority) -{ - unsigned long freed = 0; - unsigned long long delta; - long total_scan; - long freeable; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; - long scanned = 0, next_deferred; - - freeable = shrinker->count_objects(shrinker, shrinkctl); - if (freeable == 0 || freeable == SHRINK_EMPTY) - return freeable; - - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = xchg_nr_deferred(shrinker, shrinkctl); - - if (shrinker->seeks) { - delta = freeable >> priority; - delta *= 4; - do_div(delta, shrinker->seeks); - } else { - /* - * These objects don't require any IO to create. Trim - * them aggressively under memory pressure to keep - * them from causing refetches in the IO caches. - */ - delta = freeable / 2; - } - - total_scan = nr >> priority; - total_scan += delta; - total_scan = min(total_scan, (2 * freeable)); - - trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - freeable, delta, total_scan, priority); - - /* - * Normally, we should not scan less than batch_size objects in one - * pass to avoid too frequent shrinker calls, but if the slab has less - * than batch_size objects in total and we are really tight on memory, - * we will try to reclaim all available objects, otherwise we can end - * up failing allocations although there are plenty of reclaimable - * objects spread over several slabs with usage less than the - * batch_size. - * - * We detect the "tight on memory" situations by looking at the total - * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (freeable), we must be - * scanning at high prio and therefore should try to reclaim as much as - * possible. - */ - while (total_scan >= batch_size || - total_scan >= freeable) { - unsigned long ret; - unsigned long nr_to_scan = min(batch_size, total_scan); - - shrinkctl->nr_to_scan = nr_to_scan; - shrinkctl->nr_scanned = nr_to_scan; - ret = shrinker->scan_objects(shrinker, shrinkctl); - if (ret == SHRINK_STOP) - break; - freed += ret; - - count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); - total_scan -= shrinkctl->nr_scanned; - scanned += shrinkctl->nr_scanned; - - cond_resched(); - } - - /* - * The deferred work is increased by any new work (delta) that wasn't - * done, decreased by old deferred work that was done now. - * - * And it is capped to two times of the freeable items. - */ - next_deferred = max_t(long, (nr + delta - scanned), 0); - next_deferred = min(next_deferred, (2 * freeable)); - - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. - */ - new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl); - - trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan); - return freed; -} - -#ifdef CONFIG_MEMCG -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - struct shrinker_info *info; - unsigned long ret, freed = 0; - int i; - - if (!mem_cgroup_online(memcg)) - return 0; - - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); - if (unlikely(!info)) - goto unlock; - - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; - - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } - - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. - * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } -unlock: - up_read(&shrinker_rwsem); - return freed; -} -#else /* CONFIG_MEMCG */ -static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, int priority) -{ - return 0; -} -#endif /* CONFIG_MEMCG */ - -/** - * shrink_slab - shrink slab caches - * @gfp_mask: allocation context - * @nid: node whose slab caches to target - * @memcg: memory cgroup whose slab caches to target - * @priority: the reclaim priority - * - * Call the shrink functions to age shrinkable caches. - * - * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, - * unaware shrinkers will receive a node id of 0 instead. - * - * @memcg specifies the memory cgroup to target. Unaware shrinkers - * are called only if it is the root cgroup. - * - * @priority is sc->priority, we take the number of objects and >> by priority - * in order to get the scan target. - * - * Returns the number of reclaimed slab objects. - */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, - struct mem_cgroup *memcg, - int priority) -{ - unsigned long ret, freed = 0; - struct shrinker *shrinker; - - /* - * The root memcg might be allocated even though memcg is disabled - * via "cgroup_disable=memory" boot parameter. This could make - * mem_cgroup_is_root() return false, then just run memcg slab - * shrink, but skip global shrink. This may result in premature - * oom. - */ - if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) - return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } - } - - up_read(&shrinker_rwsem); -out: - cond_resched(); - return freed; -} - static unsigned long drop_slab_node(int nid) { unsigned long freed = 0; -- cgit v1.2.3 From 1dd49e58f966b1eecd935dc28458a8369ae94ad1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:16 +0800 Subject: mm: shrinker: remove redundant shrinker_rwsem in debugfs operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit debugfs_remove_recursive() will wait for debugfs_file_put() to return, so the shrinker will not be freed when doing debugfs operations (such as shrinker_debugfs_count_show() and shrinker_debugfs_scan_write()), so there is no need to hold shrinker_rwsem during debugfs operations. Link: https://lkml.kernel.org/r/20230911092517.64141-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Christian König Cc: Chuck Lever Cc: Daniel Vetter Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index ee0cddb4530f..e4ce509f619e 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -51,17 +51,12 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret, nid; + int ret = 0, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - kfree(count_per_node); - return ret; - } rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -94,7 +89,6 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); rcu_read_unlock(); - up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -119,7 +113,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, struct mem_cgroup *memcg = NULL; int nid; char kbuf[72]; - ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -148,12 +141,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - ret = down_read_killable(&shrinker_rwsem); - if (ret) { - mem_cgroup_put(memcg); - return ret; - } - sc.nid = nid; sc.memcg = memcg; sc.nr_to_scan = nr_to_scan; @@ -161,7 +148,6 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; -- cgit v1.2.3 From 0b2f5ea1aa39c0ed34bdadb53faf519e3d84ac4a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:25:17 +0800 Subject: drm/ttm: introduce pool_shrink_rwsem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, synchronize_shrinkers() is only used by TTM pool. It only requires that no shrinkers run in parallel. After we use RCU+refcount method to implement the lockless slab shrink, we can not use shrinker_rwsem or synchronize_rcu() to guarantee that all shrinker invocations have seen an update before freeing memory. So we introduce a new pool_shrink_rwsem to implement a private ttm_pool_synchronize_shrinkers(), so as to achieve the same purpose. Link: https://lkml.kernel.org/r/20230911092517.64141-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Christian König Acked-by: Daniel Vetter Cc: Christian Brauner Cc: Chuck Lever Cc: Daniel Vetter Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Joel Fernandes Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Coly Li Cc: Dai Ngo Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/gpu/drm/ttm/ttm_pool.c | 17 ++++++++++++++++- include/linux/shrinker.h | 1 - mm/shrinker.c | 15 --------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index cddb9151d20f..648ca70403a7 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -74,6 +74,7 @@ static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; static struct shrinker mm_shrinker; +static DECLARE_RWSEM(pool_shrink_rwsem); /* Allocate pages of size 1 << order with the given gfp_flags */ static struct page *ttm_pool_alloc_page(struct ttm_pool *pool, gfp_t gfp_flags, @@ -317,6 +318,7 @@ static unsigned int ttm_pool_shrink(void) unsigned int num_pages; struct page *p; + down_read(&pool_shrink_rwsem); spin_lock(&shrinker_lock); pt = list_first_entry(&shrinker_list, typeof(*pt), shrinker_list); list_move_tail(&pt->shrinker_list, &shrinker_list); @@ -329,6 +331,7 @@ static unsigned int ttm_pool_shrink(void) } else { num_pages = 0; } + up_read(&pool_shrink_rwsem); return num_pages; } @@ -572,6 +575,18 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev, } EXPORT_SYMBOL(ttm_pool_init); +/** + * ttm_pool_synchronize_shrinkers - Wait for all running shrinkers to complete. + * + * This is useful to guarantee that all shrinker invocations have seen an + * update, before freeing memory, similar to rcu. + */ +static void ttm_pool_synchronize_shrinkers(void) +{ + down_write(&pool_shrink_rwsem); + up_write(&pool_shrink_rwsem); +} + /** * ttm_pool_fini - Cleanup a pool * @@ -593,7 +608,7 @@ void ttm_pool_fini(struct ttm_pool *pool) /* We removed the pool types from the LRU, but we need to also make sure * that no shrinker is concurrently freeing pages from the pool. */ - synchronize_shrinkers(); + ttm_pool_synchronize_shrinkers(); } EXPORT_SYMBOL(ttm_pool_fini); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 8dc15aa37410..6b5843c3b827 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -103,7 +103,6 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, const char *fmt, ...); extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); -extern void synchronize_shrinkers(void); #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, diff --git a/mm/shrinker.c b/mm/shrinker.c index 043c87ccfab4..a16cd448b924 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -692,18 +692,3 @@ void unregister_shrinker(struct shrinker *shrinker) shrinker->nr_deferred = NULL; } EXPORT_SYMBOL(unregister_shrinker); - -/** - * synchronize_shrinkers - Wait for all running shrinkers to complete. - * - * This is equivalent to calling unregister_shrink() and register_shrinker(), - * but atomically and with less overhead. This is useful to guarantee that all - * shrinker invocations have seen an update, before freeing memory, similar to - * rcu. - */ -void synchronize_shrinkers(void) -{ - down_write(&shrinker_rwsem); - up_write(&shrinker_rwsem); -} -EXPORT_SYMBOL(synchronize_shrinkers); -- cgit v1.2.3 From c42d50aefd17a6bad3ed617769edbbb579137545 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:00 +0800 Subject: mm: shrinker: add infrastructure for dynamically allocating shrinker Patch series "use refcount+RCU method to implement lockless slab shrink", v6. 1. Background ============= We used to implement the lockless slab shrink with SRCU [1], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [2], so we reverted it [3]. This patch series aims to re-implement the lockless slab shrink using the refcount+RCU method proposed by Dave Chinner [4]. [1]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [2]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [3]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [4]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ 2. Implementation ================= Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). This patchset adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() 2. shrinker_register() 3. shrinker_free() In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, this patchset uses the above APIs to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using uncoverted shrinkers in the new setup. ``` Then we free the shrinker by calling call_rcu(), and use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. And the shrinker::refcount mechanism ensures that the shrinker instance will not be run again after unregistration. So the structure that records the pointer of shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to be blocked in unregister_shrinker() to wait RCU read-side critical section. PATCH 1: introduce new APIs PATCH 2~38: convert all shrinnkers to use new APIs PATCH 39: remove old APIs PATCH 40~41: some cleanups and preparations PATCH 42-43: implement the lockless slab shrink PATCH 44~45: convert shrinker_rwsem to mutex 3. Testing ========== 3.1 slab shrink stress test --------------------------- We can reproduce the down_read_trylock() hotspot through the following script: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 33.15% [kernel] [k] down_read_trylock 25.38% [kernel] [k] shrink_slab 21.75% [kernel] [k] up_read 4.45% [kernel] [k] _find_next_bit 2.27% [kernel] [k] do_shrink_slab 1.80% [kernel] [k] intel_idle_irq 1.79% [kernel] [k] shrink_lruvec 0.67% [kernel] [k] xas_descend 0.41% [kernel] [k] mem_cgroup_iter 0.40% [kernel] [k] shrink_node 0.38% [kernel] [k] list_lru_count_one 2) After applying this patchset: 64.56% [kernel] [k] shrink_slab 12.18% [kernel] [k] do_shrink_slab 3.30% [kernel] [k] __rcu_read_unlock 2.61% [kernel] [k] shrink_lruvec 2.49% [kernel] [k] __rcu_read_lock 1.93% [kernel] [k] intel_idle_irq 0.89% [kernel] [k] shrink_node 0.81% [kernel] [k] mem_cgroup_iter 0.77% [kernel] [k] mem_cgroup_calculate_protection 0.66% [kernel] [k] list_lru_count_one We can see that the first perf hotspot becomes shrink_slab, which is what we expect. 3.2 registration and unregistration stress test ----------------------------------------------- Run the command below to test: stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 & 1) Before applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 473062 60.00 8.00 279.13 7884.12 1647.59 for a 60.01s run time: 1440.34s available CPU time 7.99s user time ( 0.55%) 279.13s system time ( 19.38%) 287.12s total time ( 19.93%) load average: 7.12 2.99 1.15 successful run completed in 60.01s (1 min, 0.01 secs) 2) After applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 477165 60.00 8.13 281.34 7952.55 1648.40 for a 60.01s run time: 1440.33s available CPU time 8.12s user time ( 0.56%) 281.34s system time ( 19.53%) 289.46s total time ( 20.10%) load average: 6.98 3.03 1.19 successful run completed in 60.01s (1 min, 0.01 secs) We can see that the ops/s has hardly changed. This patch (of 45): Currently, the shrinker instances can be divided into the following three types: a) global shrinker instance statically defined in the kernel, such as workingset_shadow_shrinker. b) global shrinker instance statically defined in the kernel modules, such as mmu_shrinker in x86. c) shrinker instance embedded in other structures. For case a, the memory of shrinker instance is never freed. For case b, the memory of shrinker instance will be freed after synchronize_rcu() when the module is unloaded. For case c, the memory of shrinker instance will be freed along with the structure it is embedded in. In preparation for implementing lockless slab shrink, we need to dynamically allocate those shrinker instances in case c, then the memory can be dynamically freed alone by calling kfree_rcu(). So this commit adds the following new APIs for dynamically allocating shrinker, and add a private_data field to struct shrinker to record and get the original embedded structure. 1. shrinker_alloc() Used to allocate shrinker instance itself and related memory, it will return a pointer to the shrinker instance on success and NULL on failure. 2. shrinker_register() Used to register the shrinker instance, which is same as the current register_shrinker_prepared(). 3. shrinker_free() Used to unregister (if needed) and free the shrinker instance. In order to simplify shrinker-related APIs and make shrinker more independent of other kernel mechanisms, subsequent submissions will use the above API to convert all shrinkers (including case a and b) to dynamically allocated, and then remove all existing APIs. This will also have another advantage mentioned by Dave Chinner: ``` The other advantage of this is that it will break all the existing out of tree code and third party modules using the old API and will no longer work with a kernel using lockless slab shrinkers. They need to break (both at the source and binary levels) to stop bad things from happening due to using unconverted shrinkers in the new setup. ``` [zhengqi.arch@bytedance.com: mm: shrinker: some cleanup] Link: https://lkml.kernel.org/r/20230919024607.65463-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Greg Kroah-Hartman Cc: Kirill Tkhai Cc: Paul E. McKenney Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Steven Price Cc: Theodore Ts'o Cc: Vlastimil Babka Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 19 ++++++--- mm/internal.h | 22 ++++++++++ mm/shrinker.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++ mm/shrinker_debug.c | 3 -- 4 files changed, 142 insertions(+), 8 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 6b5843c3b827..f4a5249f00b2 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -70,6 +70,8 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + void *private_data; + /* These are for internal use */ struct list_head list; #ifdef CONFIG_MEMCG @@ -86,15 +88,22 @@ struct shrinker { }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ -/* Flags */ -#define SHRINKER_REGISTERED (1 << 0) -#define SHRINKER_NUMA_AWARE (1 << 1) -#define SHRINKER_MEMCG_AWARE (1 << 2) +/* Internal flags */ +#define SHRINKER_REGISTERED BIT(0) +#define SHRINKER_ALLOCATED BIT(1) + +/* Flags for users to use */ +#define SHRINKER_NUMA_AWARE BIT(2) +#define SHRINKER_MEMCG_AWARE BIT(3) /* * It just makes sense when the shrinker is also MEMCG_AWARE for now, * non-MEMCG_AWARE shrinker should not have this flag set. */ -#define SHRINKER_NONSLAB (1 << 3) +#define SHRINKER_NONSLAB BIT(4) + +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); +void shrinker_register(struct shrinker *shrinker); +void shrinker_free(struct shrinker *shrinker); extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/internal.h b/mm/internal.h index 0471d6326d01..a273f4d948d8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1160,6 +1160,20 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); + + return shrinker->name ? 0 : -ENOMEM; +} + +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ + kfree_const(shrinker->name); + shrinker->name = NULL; +} + extern int shrinker_debugfs_add(struct shrinker *shrinker); extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id); @@ -1170,6 +1184,14 @@ static inline int shrinker_debugfs_add(struct shrinker *shrinker) { return 0; } +static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, + const char *fmt, va_list ap) +{ + return 0; +} +static inline void shrinker_debugfs_name_free(struct shrinker *shrinker) +{ +} static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, int *debugfs_id) { diff --git a/mm/shrinker.c b/mm/shrinker.c index a16cd448b924..d1032a4d5684 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -550,6 +550,112 @@ out: return freed; } +struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) +{ + struct shrinker *shrinker; + unsigned int size; + va_list ap; + int err; + + shrinker = kzalloc(sizeof(struct shrinker), GFP_KERNEL); + if (!shrinker) + return NULL; + + va_start(ap, fmt); + err = shrinker_debugfs_name_alloc(shrinker, fmt, ap); + va_end(ap); + if (err) + goto err_name; + + shrinker->flags = flags | SHRINKER_ALLOCATED; + shrinker->seeks = DEFAULT_SEEKS; + + if (flags & SHRINKER_MEMCG_AWARE) { + err = prealloc_memcg_shrinker(shrinker); + if (err == -ENOSYS) { + /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ + shrinker->flags &= ~SHRINKER_MEMCG_AWARE; + goto non_memcg; + } + + if (err) + goto err_flags; + + return shrinker; + } + +non_memcg: + /* + * The nr_deferred is available on per memcg level for memcg aware + * shrinkers, so only allocate nr_deferred in the following cases: + * - non-memcg-aware shrinkers + * - !CONFIG_MEMCG + * - memcg is disabled by kernel command line + */ + size = sizeof(*shrinker->nr_deferred); + if (flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + goto err_flags; + + return shrinker; + +err_flags: + shrinker_debugfs_name_free(shrinker); +err_name: + kfree(shrinker); + return NULL; +} +EXPORT_SYMBOL_GPL(shrinker_alloc); + +void shrinker_register(struct shrinker *shrinker) +{ + if (unlikely(!(shrinker->flags & SHRINKER_ALLOCATED))) { + pr_warn("Must use shrinker_alloc() to dynamically allocate the shrinker"); + return; + } + + down_write(&shrinker_rwsem); + list_add_tail(&shrinker->list, &shrinker_list); + shrinker->flags |= SHRINKER_REGISTERED; + shrinker_debugfs_add(shrinker); + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL_GPL(shrinker_register); + +void shrinker_free(struct shrinker *shrinker) +{ + struct dentry *debugfs_entry = NULL; + int debugfs_id; + + if (!shrinker) + return; + + down_write(&shrinker_rwsem); + if (shrinker->flags & SHRINKER_REGISTERED) { + list_del(&shrinker->list); + debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); + shrinker->flags &= ~SHRINKER_REGISTERED; + } + + shrinker_debugfs_name_free(shrinker); + + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + unregister_memcg_shrinker(shrinker); + up_write(&shrinker_rwsem); + + if (debugfs_entry) + shrinker_debugfs_remove(debugfs_entry, debugfs_id); + + kfree(shrinker->nr_deferred); + shrinker->nr_deferred = NULL; + + kfree(shrinker); +} +EXPORT_SYMBOL_GPL(shrinker_free); + /* * Add a shrinker callback to be called from the vm. */ diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index e4ce509f619e..24aebe7c24cc 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -241,9 +241,6 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, lockdep_assert_held(&shrinker_rwsem); - kfree_const(shrinker->name); - shrinker->name = NULL; - *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; -- cgit v1.2.3 From e5985c40987640717e3985e2ae02bf707c12f74a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:01 +0800 Subject: kvm: mmu: dynamically allocate the x86-mmu shrinker Use new APIs to dynamically allocate the x86-mmu shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- arch/x86/kvm/mmu/mmu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7901cb4d2fa..077d01d931de 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -6785,11 +6785,7 @@ static unsigned long mmu_shrink_count(struct shrinker *shrink, return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } -static struct shrinker mmu_shrinker = { - .count_objects = mmu_shrink_count, - .scan_objects = mmu_shrink_scan, - .seeks = DEFAULT_SEEKS * 10, -}; +static struct shrinker *mmu_shrinker; static void mmu_destroy_caches(void) { @@ -6922,10 +6918,16 @@ int kvm_mmu_vendor_module_init(void) if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) goto out; - ret = register_shrinker(&mmu_shrinker, "x86-mmu"); - if (ret) + mmu_shrinker = shrinker_alloc(0, "x86-mmu"); + if (!mmu_shrinker) goto out_shrinker; + mmu_shrinker->count_objects = mmu_shrink_count; + mmu_shrinker->scan_objects = mmu_shrink_scan; + mmu_shrinker->seeks = DEFAULT_SEEKS * 10; + + shrinker_register(mmu_shrinker); + return 0; out_shrinker: @@ -6947,7 +6949,7 @@ void kvm_mmu_vendor_module_exit(void) { mmu_destroy_caches(); percpu_counter_destroy(&kvm_total_used_mmu_pages); - unregister_shrinker(&mmu_shrinker); + shrinker_free(mmu_shrinker); } /* -- cgit v1.2.3 From 95a542da5322ef2db193a546d2d888cf4dbaa219 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:02 +0800 Subject: binder: dynamically allocate the android-binder shrinker Use new APIs to dynamically allocate the android-binder shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Carlos Llamas Cc: Greg Kroah-Hartman Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index e3db8297095a..138f6d43d13b 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1053,11 +1053,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) NULL, sc->nr_to_scan); } -static struct shrinker binder_shrinker = { - .count_objects = binder_shrink_count, - .scan_objects = binder_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *binder_shrinker; /** * binder_alloc_init() - called by binder_open() for per-proc initialization @@ -1077,19 +1073,29 @@ void binder_alloc_init(struct binder_alloc *alloc) int binder_alloc_shrinker_init(void) { - int ret = list_lru_init(&binder_alloc_lru); + int ret; - if (ret == 0) { - ret = register_shrinker(&binder_shrinker, "android-binder"); - if (ret) - list_lru_destroy(&binder_alloc_lru); + ret = list_lru_init(&binder_alloc_lru); + if (ret) + return ret; + + binder_shrinker = shrinker_alloc(0, "android-binder"); + if (!binder_shrinker) { + list_lru_destroy(&binder_alloc_lru); + return -ENOMEM; } - return ret; + + binder_shrinker->count_objects = binder_shrink_count; + binder_shrinker->scan_objects = binder_shrink_scan; + + shrinker_register(binder_shrinker); + + return 0; } void binder_alloc_shrinker_exit(void) { - unregister_shrinker(&binder_shrinker); + shrinker_free(binder_shrinker); list_lru_destroy(&binder_alloc_lru); } -- cgit v1.2.3 From d35b5c98c1f1ea44d4652a78163ce6d0e6ec2b78 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:03 +0800 Subject: drm/ttm: dynamically allocate the drm-ttm_pool shrinker Use new APIs to dynamically allocate the drm-ttm_pool shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Christian Koenig Cc: Huang Rui Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/gpu/drm/ttm/ttm_pool.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 648ca70403a7..fe610a3cace0 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -73,7 +73,7 @@ static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; static spinlock_t shrinker_lock; static struct list_head shrinker_list; -static struct shrinker mm_shrinker; +static struct shrinker *mm_shrinker; static DECLARE_RWSEM(pool_shrink_rwsem); /* Allocate pages of size 1 << order with the given gfp_flags */ @@ -749,8 +749,8 @@ static int ttm_pool_debugfs_shrink_show(struct seq_file *m, void *data) struct shrink_control sc = { .gfp_mask = GFP_NOFS }; fs_reclaim_acquire(GFP_KERNEL); - seq_printf(m, "%lu/%lu\n", ttm_pool_shrinker_count(&mm_shrinker, &sc), - ttm_pool_shrinker_scan(&mm_shrinker, &sc)); + seq_printf(m, "%lu/%lu\n", ttm_pool_shrinker_count(mm_shrinker, &sc), + ttm_pool_shrinker_scan(mm_shrinker, &sc)); fs_reclaim_release(GFP_KERNEL); return 0; @@ -794,10 +794,17 @@ int ttm_pool_mgr_init(unsigned long num_pages) &ttm_pool_debugfs_shrink_fops); #endif - mm_shrinker.count_objects = ttm_pool_shrinker_count; - mm_shrinker.scan_objects = ttm_pool_shrinker_scan; - mm_shrinker.seeks = 1; - return register_shrinker(&mm_shrinker, "drm-ttm_pool"); + mm_shrinker = shrinker_alloc(0, "drm-ttm_pool"); + if (!mm_shrinker) + return -ENOMEM; + + mm_shrinker->count_objects = ttm_pool_shrinker_count; + mm_shrinker->scan_objects = ttm_pool_shrinker_scan; + mm_shrinker->seeks = 1; + + shrinker_register(mm_shrinker); + + return 0; } /** @@ -817,6 +824,6 @@ void ttm_pool_mgr_fini(void) ttm_pool_type_fini(&global_dma32_uncached[i]); } - unregister_shrinker(&mm_shrinker); + shrinker_free(mm_shrinker); WARN_ON(!list_empty(&shrinker_list)); } -- cgit v1.2.3 From 1ec016bfa10f2873c125018c4efcf09c141645ff Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:04 +0800 Subject: xenbus/backend: dynamically allocate the xen-backend shrinker Use new APIs to dynamically allocate the xen-backend shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-6-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Juergen Gross Cc: Stefano Stabellini Cc: Oleksandr Tyshchenko Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/xen/xenbus/xenbus_probe_backend.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index da96c260e26b..5ebb7233076f 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -284,13 +284,9 @@ static unsigned long backend_shrink_memory_count(struct shrinker *shrinker, return 0; } -static struct shrinker backend_memory_shrinker = { - .count_objects = backend_shrink_memory_count, - .seeks = DEFAULT_SEEKS, -}; - static int __init xenbus_probe_backend_init(void) { + struct shrinker *backend_memory_shrinker; static struct notifier_block xenstore_notifier = { .notifier_call = backend_probe_and_watch }; @@ -305,8 +301,15 @@ static int __init xenbus_probe_backend_init(void) register_xenstore_notifier(&xenstore_notifier); - if (register_shrinker(&backend_memory_shrinker, "xen-backend")) - pr_warn("shrinker registration failed\n"); + backend_memory_shrinker = shrinker_alloc(0, "xen-backend"); + if (!backend_memory_shrinker) { + pr_warn("shrinker allocation failed\n"); + return 0; + } + + backend_memory_shrinker->count_objects = backend_shrink_memory_count; + + shrinker_register(backend_memory_shrinker); return 0; } -- cgit v1.2.3 From 557936ee8dc6bd0d5e078f415441d9f2a3fcca23 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:05 +0800 Subject: erofs: dynamically allocate the erofs-shrinker Use new APIs to dynamically allocate the erofs-shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Reviewed-by: Gao Xiang Cc: Yue Hu Cc: Jeffle Xu Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- fs/erofs/utils.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index cc6fb9e98899..e9c25cd7b601 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -270,19 +270,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, return freed; } -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *erofs_shrinker_info; int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + + shrinker_register(erofs_shrinker_info); + + return 0; } void erofs_exit_shrinker(void) { - unregister_shrinker(&erofs_shrinker_info); + shrinker_free(erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ -- cgit v1.2.3 From bfcba5ba39cbce8957b4908c9dd570dd6097fff3 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:06 +0800 Subject: f2fs: dynamically allocate the f2fs-shrinker Use new APIs to dynamically allocate the f2fs-shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Cc: Jaegeuk Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/f2fs/super.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a8c8232852bb..fe25ff9cebbe 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #endif /* f2fs-wide shrinker description */ -static struct shrinker f2fs_shrinker_info = { - .scan_objects = f2fs_shrink_scan, - .count_objects = f2fs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *f2fs_shrinker_info; + +static int __init f2fs_init_shrinker(void) +{ + f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); + if (!f2fs_shrinker_info) + return -ENOMEM; + + f2fs_shrinker_info->count_objects = f2fs_shrink_count; + f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; + + shrinker_register(f2fs_shrinker_info); + + return 0; +} + +static void f2fs_exit_shrinker(void) +{ + shrinker_free(f2fs_shrinker_info); +} enum { Opt_gc_background, @@ -4944,7 +4959,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); + err = f2fs_init_shrinker(); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); @@ -4989,7 +5004,7 @@ free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); free_shrinker: - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); free_garbage_collection_cache: @@ -5021,7 +5036,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); -- cgit v1.2.3 From a304c23cd6c96ecdf5f96df66e809993d5e96bf0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:07 +0800 Subject: gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/gfs2/glock.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4a280be229a6..482291bb08d8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2041,11 +2041,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(atomic_read(&lru_count)); } -static struct shrinker glock_shrinker = { - .seeks = DEFAULT_SEEKS, - .count_objects = gfs2_glock_shrink_count, - .scan_objects = gfs2_glock_shrink_scan, -}; +static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket @@ -2465,13 +2461,18 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker, "gfs2-glock"); - if (ret) { + glock_shrinker = shrinker_alloc(0, "gfs2-glock"); + if (!glock_shrinker) { destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); - return ret; + return -ENOMEM; } + glock_shrinker->count_objects = gfs2_glock_shrink_count; + glock_shrinker->scan_objects = gfs2_glock_shrink_scan; + + shrinker_register(glock_shrinker); + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); @@ -2480,7 +2481,7 @@ int __init gfs2_glock_init(void) void gfs2_glock_exit(void) { - unregister_shrinker(&glock_shrinker); + shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); } -- cgit v1.2.3 From 8ee0fd9c1085d952ea62c1879620eaa01e4c6332 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:08 +0800 Subject: gfs2: dynamically allocate the gfs2-qd shrinker Use new APIs to dynamically allocate the gfs2-qd shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-10-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/gfs2/main.c | 6 +++--- fs/gfs2/quota.c | 25 +++++++++++++++++++------ fs/gfs2/quota.h | 3 ++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 66eb98b690a2..79be0cdc730c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + error = gfs2_qd_shrinker_init(); if (error) goto fail_shrinker; @@ -196,7 +196,7 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs2_recovery_wq); fail_wq1: - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); fail_cachep8: @@ -229,7 +229,7 @@ fail_lru: static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 171b2713d2e5..d3d013d1d5ac 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -196,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } -struct shrinker gfs2_qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, -}; +static struct shrinker *gfs2_qd_shrinker; + +int __init gfs2_qd_shrinker_init(void) +{ + gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); + if (!gfs2_qd_shrinker) + return -ENOMEM; + + gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; + gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; + + shrinker_register(gfs2_qd_shrinker); + return 0; +} + +void gfs2_qd_shrinker_exit(void) +{ + shrinker_free(gfs2_qd_shrinker); +} static u64 qd2index(struct gfs2_quota_data *qd) { diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 1429945215a0..36f54b426b0c 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -60,7 +60,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, } extern const struct quotactl_ops gfs2_quotactl_ops; -extern struct shrinker gfs2_qd_shrinker; +int __init gfs2_qd_shrinker_init(void); +void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; extern void __init gfs2_quota_hash_init(void); -- cgit v1.2.3 From d5dad4929fb42e7f610cda5e1ea51cd76be998d4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:09 +0800 Subject: NFSv4.2: dynamically allocate the nfs-xattr shrinkers Use new APIs to dynamically allocate the nfs-xattr shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-11-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfs/nfs42xattr.c | 87 +++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 911f634ba3da..2ad66a8922f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfs4_xattr_cache_shrinker = { - .count_objects = nfs4_xattr_cache_count, - .scan_objects = nfs4_xattr_cache_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = DEFAULT_SEEKS, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_large_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = 1, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, @@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) struct nfs4_xattr_entry *entry; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); @@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; count = list_lru_shrink_count(lru, sc); @@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } -static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, - struct list_lru *lru, const char *name) +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) { - int ret = 0; + int ret; - ret = register_shrinker(shrinker, name); - if (ret) + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); return ret; + } - ret = list_lru_init_memcg(lru, shrinker); - if (ret) - unregister_shrinker(shrinker); + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); return ret; } @@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, struct list_lru *lru) { - unregister_shrinker(shrinker); + shrinker_free(shrinker); list_lru_destroy(lru); } @@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void) return -ENOMEM; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, - &nfs4_xattr_cache_lru, - "nfs-xattr_cache"); + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); if (ret) goto out1; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, - &nfs4_xattr_entry_lru, - "nfs-xattr_entry"); + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); if (ret) goto out2; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru, - "nfs-xattr_large_entry"); + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); if (!ret) return 0; - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); out2: - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); @@ -1056,11 +1057,11 @@ out1: void nfs4_xattr_cache_exit(void) { - nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } -- cgit v1.2.3 From 777fc8f1b4b9a0c5a558f1ab49ddfa5137bfaeb3 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:10 +0800 Subject: nfs: dynamically allocate the nfs-acl shrinker Use new APIs to dynamically allocate the nfs-acl shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-12-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfs/super.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d6473cb00cb..09ded7f63acf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -153,9 +149,18 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker, "nfs-acl"); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + #ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); #endif @@ -175,7 +180,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); #ifdef CONFIG_NFS_V4_2 -- cgit v1.2.3 From 856e594965414a1cc63d0f3e709ecb6869bc685e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:11 +0800 Subject: nfsd: dynamically allocate the nfsd-filecache shrinker Use new APIs to dynamically allocate the nfsd-filecache shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-13-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Jeff Layton Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/filecache.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ee9c923192e0..9c62b4502539 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) return ret; } -static struct shrinker nfsd_file_shrinker = { - .scan_objects = nfsd_file_lru_scan, - .count_objects = nfsd_file_lru_count, - .seeks = 1, -}; +static struct shrinker *nfsd_file_shrinker; /** * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file @@ -746,12 +742,19 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); - if (ret) { - pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache"); + if (!nfsd_file_shrinker) { + ret = -ENOMEM; + pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); goto out_lru; } + nfsd_file_shrinker->count_objects = nfsd_file_lru_count; + nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan; + nfsd_file_shrinker->seeks = 1; + + shrinker_register(nfsd_file_shrinker); + ret = lease_register_notifier(&nfsd_file_lease_notifier); if (ret) { pr_err("nfsd: unable to register lease notifier: %d\n", ret); @@ -774,7 +777,7 @@ out: out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); out_shrinker: - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); out_lru: list_lru_destroy(&nfsd_file_lru); out_err: @@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void) return; lease_unregister_notifier(&nfsd_file_lease_notifier); - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); /* * make sure all callers of nfsd_file_lru_cb are done before * calling nfsd_file_cache_purge -- cgit v1.2.3 From eab477e883b52a82bd7b4437111bbc6264694349 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:12 +0800 Subject: quota: dynamically allocate the dquota-cache shrinker Use new APIs to dynamically allocate the dquota-cache shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-14-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/quota/dquot.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9e72bfe8bbad..15030b0cd1c8 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -791,12 +791,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } -static struct shrinker dqcache_shrinker = { - .count_objects = dqcache_shrink_count, - .scan_objects = dqcache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - /* * Safely release dquot and put reference to dquot. */ @@ -2956,6 +2950,7 @@ static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; + struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -2990,8 +2985,14 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker, "dquota-cache")) - panic("Cannot register dquot shrinker"); + dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); + if (!dqcache_shrinker) + panic("Cannot allocate dquot shrinker"); + + dqcache_shrinker->count_objects = dqcache_shrink_count; + dqcache_shrinker->scan_objects = dqcache_shrink_scan; + + shrinker_register(dqcache_shrinker); return 0; } -- cgit v1.2.3 From 827a34f90e5a27b83f188b914512d81e36688dec Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:13 +0800 Subject: ubifs: dynamically allocate the ubifs-slab shrinker Use new APIs to dynamically allocate the ubifs-slab shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-15-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Richard Weinberger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/ubifs/super.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b08fb28d16b5..96f6a9118207 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ -static struct shrinker ubifs_shrinker_info = { - .scan_objects = ubifs_shrink_scan, - .count_objects = ubifs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. @@ -2373,7 +2369,7 @@ static void inode_slab_ctor(void *obj) static int __init ubifs_init(void) { - int err; + int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); @@ -2439,10 +2435,15 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); - if (err) + ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); + if (!ubifs_shrinker_info) goto out_slab; + ubifs_shrinker_info->count_objects = ubifs_shrink_count; + ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; + + shrinker_register(ubifs_shrinker_info); + err = ubifs_compressors_init(); if (err) goto out_shrinker; @@ -2467,7 +2468,7 @@ out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; @@ -2483,7 +2484,7 @@ static void __exit ubifs_exit(void) dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we -- cgit v1.2.3 From 2fbacff0cbf58530fa459c3f28ba9125df33ad86 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:14 +0800 Subject: rcu: dynamically allocate the rcu-lazy shrinker Use new APIs to dynamically allocate the rcu-lazy shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-16-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- kernel/rcu/tree_nocb.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 5598212d1f27..4efbf7333d4e 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -1396,13 +1396,6 @@ lazy_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return count ? count : SHRINK_STOP; } - -static struct shrinker lazy_rcu_shrinker = { - .count_objects = lazy_rcu_shrink_count, - .scan_objects = lazy_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; #endif // #ifdef CONFIG_RCU_LAZY void __init rcu_init_nohz(void) @@ -1410,6 +1403,7 @@ void __init rcu_init_nohz(void) int cpu; struct rcu_data *rdp; const struct cpumask *cpumask = NULL; + struct shrinker * __maybe_unused lazy_rcu_shrinker; #if defined(CONFIG_NO_HZ_FULL) if (tick_nohz_full_running && !cpumask_empty(tick_nohz_full_mask)) @@ -1436,8 +1430,15 @@ void __init rcu_init_nohz(void) return; #ifdef CONFIG_RCU_LAZY - if (register_shrinker(&lazy_rcu_shrinker, "rcu-lazy")) - pr_err("Failed to register lazy_rcu shrinker!\n"); + lazy_rcu_shrinker = shrinker_alloc(0, "rcu-lazy"); + if (!lazy_rcu_shrinker) { + pr_err("Failed to allocate lazy_rcu shrinker!\n"); + } else { + lazy_rcu_shrinker->count_objects = lazy_rcu_shrink_count; + lazy_rcu_shrinker->scan_objects = lazy_rcu_shrink_scan; + + shrinker_register(lazy_rcu_shrinker); + } #endif // #ifdef CONFIG_RCU_LAZY if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { -- cgit v1.2.3 From 21e0b932fb5d72bb6cefdf56e22bfddea968cf32 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:15 +0800 Subject: rcu: dynamically allocate the rcu-kfree shrinker Use new APIs to dynamically allocate the rcu-kfree shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-17-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Joel Fernandes (Google) Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- kernel/rcu/tree.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cb1caefa8bd0..06e2ed495c02 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3449,13 +3449,6 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) return freed == 0 ? SHRINK_STOP : freed; } -static struct shrinker kfree_rcu_shrinker = { - .count_objects = kfree_rcu_shrink_count, - .scan_objects = kfree_rcu_shrink_scan, - .batch = 0, - .seeks = DEFAULT_SEEKS, -}; - void __init kfree_rcu_scheduler_running(void) { int cpu; @@ -4931,6 +4924,7 @@ static void __init kfree_rcu_batch_init(void) { int cpu; int i, j; + struct shrinker *kfree_rcu_shrinker; /* Clamp it to [0:100] seconds interval. */ if (rcu_delay_page_cache_fill_msec < 0 || @@ -4962,8 +4956,17 @@ static void __init kfree_rcu_batch_init(void) INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } - if (register_shrinker(&kfree_rcu_shrinker, "rcu-kfree")) - pr_err("Failed to register kfree_rcu() shrinker!\n"); + + kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree"); + if (!kfree_rcu_shrinker) { + pr_err("Failed to allocate kfree_rcu() shrinker!\n"); + return; + } + + kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count; + kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan; + + shrinker_register(kfree_rcu_shrinker); } void __init rcu_init(void) -- cgit v1.2.3 From 54d917295b8366545e6ee940c93dffe1452e6480 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:16 +0800 Subject: mm: thp: dynamically allocate the thp-related shrinkers Use new APIs to dynamically allocate the thp-zero and thp-deferred_split shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-18-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/huge_memory.c | 67 ++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 064fbd90822b..1cfd83e91748 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -65,7 +65,11 @@ unsigned long transparent_hugepage_flags __read_mostly = (1<count_objects = shrink_huge_zero_page_count; + huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan; + shrinker_register(huge_zero_page_shrinker); + + deferred_split_shrinker->count_objects = deferred_split_count; + deferred_split_shrinker->scan_objects = deferred_split_scan; + shrinker_register(deferred_split_shrinker); + + return 0; +} + +static void __init thp_shrinker_exit(void) +{ + shrinker_free(huge_zero_page_shrinker); + shrinker_free(deferred_split_shrinker); +} + static int __init hugepage_init(void) { int err; @@ -482,12 +514,9 @@ static int __init hugepage_init(void) if (err) goto err_slab; - err = register_shrinker(&huge_zero_page_shrinker, "thp-zero"); - if (err) - goto err_hzp_shrinker; - err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split"); + err = thp_shrinker_init(); if (err) - goto err_split_shrinker; + goto err_shrinker; /* * By default disable transparent hugepages on smaller systems, @@ -505,10 +534,8 @@ static int __init hugepage_init(void) return 0; err_khugepaged: - unregister_shrinker(&deferred_split_shrinker); -err_split_shrinker: - unregister_shrinker(&huge_zero_page_shrinker); -err_hzp_shrinker: + thp_shrinker_exit(); +err_shrinker: khugepaged_destroy(); err_slab: hugepage_exit_sysfs(hugepage_kobj); @@ -2828,7 +2855,7 @@ void deferred_split_folio(struct folio *folio) #ifdef CONFIG_MEMCG if (memcg) set_shrinker_bit(memcg, folio_nid(folio), - deferred_split_shrinker.id); + deferred_split_shrinker->id); #endif } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); @@ -2902,14 +2929,6 @@ next: return split; } -static struct shrinker deferred_split_shrinker = { - .count_objects = deferred_split_count, - .scan_objects = deferred_split_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE | - SHRINKER_NONSLAB, -}; - #ifdef CONFIG_DEBUG_FS static void split_huge_pages_all(void) { -- cgit v1.2.3 From abe0c269f9820c637223ad88d77d174e8d85c58f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:17 +0800 Subject: sunrpc: dynamically allocate the sunrpc_cred shrinker Use new APIs to dynamically allocate the sunrpc_cred shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-19-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Jeff Layton Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- net/sunrpc/auth.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 2f16f9d17966..c9c270eececc 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -861,11 +861,7 @@ rpcauth_uptodatecred(struct rpc_task *task) test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } -static struct shrinker rpc_cred_shrinker = { - .count_objects = rpcauth_cache_shrink_count, - .scan_objects = rpcauth_cache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *rpc_cred_shrinker; int __init rpcauth_init_module(void) { @@ -874,9 +870,17 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = register_shrinker(&rpc_cred_shrinker, "sunrpc_cred"); - if (err < 0) + rpc_cred_shrinker = shrinker_alloc(0, "sunrpc_cred"); + if (!rpc_cred_shrinker) { + err = -ENOMEM; goto out2; + } + + rpc_cred_shrinker->count_objects = rpcauth_cache_shrink_count; + rpc_cred_shrinker->scan_objects = rpcauth_cache_shrink_scan; + + shrinker_register(rpc_cred_shrinker); + return 0; out2: rpc_destroy_authunix(); @@ -887,5 +891,5 @@ out1: void rpcauth_remove_module(void) { rpc_destroy_authunix(); - unregister_shrinker(&rpc_cred_shrinker); + shrinker_free(rpc_cred_shrinker); } -- cgit v1.2.3 From 219c666eb2854bb9ca94d04a53ceb14aaf81cb28 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:18 +0800 Subject: mm: workingset: dynamically allocate the mm-shadow shrinker Use new APIs to dynamically allocate the mm-shadow shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-20-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/workingset.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index da58a26d0d4d..b192e44a0e7c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -763,13 +763,6 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, NULL); } -static struct shrinker workingset_shadow_shrinker = { - .count_objects = count_shadow_nodes, - .scan_objects = scan_shadow_nodes, - .seeks = 0, /* ->count reports only fully expendable nodes */ - .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, -}; - /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe * i_pages lock. @@ -778,9 +771,10 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + struct shrinker *workingset_shadow_shrinker; unsigned int timestamp_bits; unsigned int max_order; - int ret; + int ret = -ENOMEM; BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); /* @@ -797,17 +791,26 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow"); - if (ret) + workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE | + SHRINKER_MEMCG_AWARE, + "mm-shadow"); + if (!workingset_shadow_shrinker) goto err; + ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, - &workingset_shadow_shrinker); + workingset_shadow_shrinker); if (ret) goto err_list_lru; - register_shrinker_prepared(&workingset_shadow_shrinker); + + workingset_shadow_shrinker->count_objects = count_shadow_nodes; + workingset_shadow_shrinker->scan_objects = scan_shadow_nodes; + /* ->count reports only fully expendable nodes */ + workingset_shadow_shrinker->seeks = 0; + + shrinker_register(workingset_shadow_shrinker); return 0; err_list_lru: - free_prealloced_shrinker(&workingset_shadow_shrinker); + shrinker_free(workingset_shadow_shrinker); err: return ret; } -- cgit v1.2.3 From 583cc9e41095292e2ebf33c977d8ba1e64308892 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:19 +0800 Subject: drm/i915: dynamically allocate the i915_gem_mm shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the i915_gem_mm shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct drm_i915_private. Link: https://lkml.kernel.org/r/20230911094444.68966-21-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Tvrtko Ursulin Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_shrinker.c | 29 +++++++++++++++------------- drivers/gpu/drm/i915/i915_drv.h | 2 +- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c index 214763942aa2..e07ffbd9eab3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c @@ -284,8 +284,7 @@ unsigned long i915_gem_shrink_all(struct drm_i915_private *i915) static unsigned long i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct drm_i915_private *i915 = - container_of(shrinker, struct drm_i915_private, mm.shrinker); + struct drm_i915_private *i915 = shrinker->private_data; unsigned long num_objects; unsigned long count; @@ -302,8 +301,8 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) if (num_objects) { unsigned long avg = 2 * count / num_objects; - i915->mm.shrinker.batch = - max((i915->mm.shrinker.batch + avg) >> 1, + i915->mm.shrinker->batch = + max((i915->mm.shrinker->batch + avg) >> 1, 128ul /* default SHRINK_BATCH */); } @@ -313,8 +312,7 @@ i915_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) static unsigned long i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct drm_i915_private *i915 = - container_of(shrinker, struct drm_i915_private, mm.shrinker); + struct drm_i915_private *i915 = shrinker->private_data; unsigned long freed; sc->nr_scanned = 0; @@ -422,12 +420,17 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr void i915_gem_driver_register__shrinker(struct drm_i915_private *i915) { - i915->mm.shrinker.scan_objects = i915_gem_shrinker_scan; - i915->mm.shrinker.count_objects = i915_gem_shrinker_count; - i915->mm.shrinker.seeks = DEFAULT_SEEKS; - i915->mm.shrinker.batch = 4096; - drm_WARN_ON(&i915->drm, register_shrinker(&i915->mm.shrinker, - "drm-i915_gem")); + i915->mm.shrinker = shrinker_alloc(0, "drm-i915_gem"); + if (!i915->mm.shrinker) { + drm_WARN_ON(&i915->drm, 1); + } else { + i915->mm.shrinker->scan_objects = i915_gem_shrinker_scan; + i915->mm.shrinker->count_objects = i915_gem_shrinker_count; + i915->mm.shrinker->batch = 4096; + i915->mm.shrinker->private_data = i915; + + shrinker_register(i915->mm.shrinker); + } i915->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom; drm_WARN_ON(&i915->drm, register_oom_notifier(&i915->mm.oom_notifier)); @@ -443,7 +446,7 @@ void i915_gem_driver_unregister__shrinker(struct drm_i915_private *i915) unregister_vmap_purge_notifier(&i915->mm.vmap_notifier)); drm_WARN_ON(&i915->drm, unregister_oom_notifier(&i915->mm.oom_notifier)); - unregister_shrinker(&i915->mm.shrinker); + shrinker_free(i915->mm.shrinker); } void i915_gem_shrinker_taints_mutex(struct drm_i915_private *i915, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7a8ce7239bc9..f2f21da4d7f9 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -163,7 +163,7 @@ struct i915_gem_mm { struct notifier_block oom_notifier; struct notifier_block vmap_notifier; - struct shrinker shrinker; + struct shrinker *shrinker; #ifdef CONFIG_MMU_NOTIFIER /** -- cgit v1.2.3 From cd61a76c210afd324c345ff62d950e2ff7947f11 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:20 +0800 Subject: drm/msm: dynamically allocate the drm-msm_gem shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the drm-msm_gem shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct msm_drm_private. Link: https://lkml.kernel.org/r/20230911094444.68966-22-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Daniel Vetter Cc: Rob Clark Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Sean Paul Cc: Marijn Suijten Cc: David Airlie Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/gpu/drm/msm/msm_drv.c | 4 +++- drivers/gpu/drm/msm/msm_drv.h | 4 ++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 33 ++++++++++++++++++++------------- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 4bd028fa7500..7f20249d6071 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -462,7 +462,9 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv) if (ret) goto err_msm_uninit; - msm_gem_shrinker_init(ddev); + ret = msm_gem_shrinker_init(ddev); + if (ret) + goto err_msm_uninit; if (priv->kms_init) { ret = priv->kms_init(ddev); diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 02fd6c7d0bb7..e2fc56f161b5 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -221,7 +221,7 @@ struct msm_drm_private { } vram; struct notifier_block vmap_notifier; - struct shrinker shrinker; + struct shrinker *shrinker; struct drm_atomic_state *pm_state; @@ -283,7 +283,7 @@ int msm_ioctl_gem_submit(struct drm_device *dev, void *data, unsigned long msm_gem_shrinker_shrink(struct drm_device *dev, unsigned long nr_to_scan); #endif -void msm_gem_shrinker_init(struct drm_device *dev); +int msm_gem_shrinker_init(struct drm_device *dev); void msm_gem_shrinker_cleanup(struct drm_device *dev); struct sg_table *msm_gem_prime_get_sg_table(struct drm_gem_object *obj); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index f38296ad8743..5a7d48c02c4b 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -34,8 +34,7 @@ static bool can_block(struct shrink_control *sc) static unsigned long msm_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct msm_drm_private *priv = - container_of(shrinker, struct msm_drm_private, shrinker); + struct msm_drm_private *priv = shrinker->private_data; unsigned count = priv->lru.dontneed.count; if (can_swap()) @@ -100,8 +99,7 @@ active_evict(struct drm_gem_object *obj) static unsigned long msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct msm_drm_private *priv = - container_of(shrinker, struct msm_drm_private, shrinker); + struct msm_drm_private *priv = shrinker->private_data; struct { struct drm_gem_lru *lru; bool (*shrink)(struct drm_gem_object *obj); @@ -148,10 +146,11 @@ msm_gem_shrinker_shrink(struct drm_device *dev, unsigned long nr_to_scan) struct shrink_control sc = { .nr_to_scan = nr_to_scan, }; - int ret; + unsigned long ret = SHRINK_STOP; fs_reclaim_acquire(GFP_KERNEL); - ret = msm_gem_shrinker_scan(&priv->shrinker, &sc); + if (priv->shrinker) + ret = msm_gem_shrinker_scan(priv->shrinker, &sc); fs_reclaim_release(GFP_KERNEL); return ret; @@ -210,16 +209,24 @@ msm_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr) * * This function registers and sets up the msm shrinker. */ -void msm_gem_shrinker_init(struct drm_device *dev) +int msm_gem_shrinker_init(struct drm_device *dev) { struct msm_drm_private *priv = dev->dev_private; - priv->shrinker.count_objects = msm_gem_shrinker_count; - priv->shrinker.scan_objects = msm_gem_shrinker_scan; - priv->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&priv->shrinker, "drm-msm_gem")); + + priv->shrinker = shrinker_alloc(0, "drm-msm_gem"); + if (!priv->shrinker) + return -ENOMEM; + + priv->shrinker->count_objects = msm_gem_shrinker_count; + priv->shrinker->scan_objects = msm_gem_shrinker_scan; + priv->shrinker->private_data = priv; + + shrinker_register(priv->shrinker); priv->vmap_notifier.notifier_call = msm_gem_shrinker_vmap; WARN_ON(register_vmap_purge_notifier(&priv->vmap_notifier)); + + return 0; } /** @@ -232,8 +239,8 @@ void msm_gem_shrinker_cleanup(struct drm_device *dev) { struct msm_drm_private *priv = dev->dev_private; - if (priv->shrinker.nr_deferred) { + if (priv->shrinker) { WARN_ON(unregister_vmap_purge_notifier(&priv->vmap_notifier)); - unregister_shrinker(&priv->shrinker); + shrinker_free(priv->shrinker); } } -- cgit v1.2.3 From e11c4f3acbbb776992425867df894b8c0f131f50 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:21 +0800 Subject: drm/panfrost: dynamically allocate the drm-panfrost shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the drm-panfrost shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct panfrost_device. Link: https://lkml.kernel.org/r/20230911094444.68966-23-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Steven Price Acked-by: Daniel Vetter Cc: Rob Herring Cc: Tomeu Vizoso Cc: Alyssa Rosenzweig Cc: David Airlie Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/gpu/drm/panfrost/panfrost_device.h | 2 +- drivers/gpu/drm/panfrost/panfrost_drv.c | 6 ++++- drivers/gpu/drm/panfrost/panfrost_gem.h | 2 +- drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c | 29 ++++++++++++++---------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/panfrost/panfrost_device.h b/drivers/gpu/drm/panfrost/panfrost_device.h index b0126b9fbadc..e667e5689353 100644 --- a/drivers/gpu/drm/panfrost/panfrost_device.h +++ b/drivers/gpu/drm/panfrost/panfrost_device.h @@ -118,7 +118,7 @@ struct panfrost_device { struct mutex shrinker_lock; struct list_head shrinker_list; - struct shrinker shrinker; + struct shrinker *shrinker; struct panfrost_devfreq pfdevfreq; }; diff --git a/drivers/gpu/drm/panfrost/panfrost_drv.c b/drivers/gpu/drm/panfrost/panfrost_drv.c index a2ab99698ca8..e1d0e3a23757 100644 --- a/drivers/gpu/drm/panfrost/panfrost_drv.c +++ b/drivers/gpu/drm/panfrost/panfrost_drv.c @@ -601,10 +601,14 @@ static int panfrost_probe(struct platform_device *pdev) if (err < 0) goto err_out1; - panfrost_gem_shrinker_init(ddev); + err = panfrost_gem_shrinker_init(ddev); + if (err) + goto err_out2; return 0; +err_out2: + drm_dev_unregister(ddev); err_out1: pm_runtime_disable(pfdev->dev); panfrost_device_fini(pfdev); diff --git a/drivers/gpu/drm/panfrost/panfrost_gem.h b/drivers/gpu/drm/panfrost/panfrost_gem.h index ad2877eeeccd..863d2ec8d4f0 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem.h +++ b/drivers/gpu/drm/panfrost/panfrost_gem.h @@ -81,7 +81,7 @@ panfrost_gem_mapping_get(struct panfrost_gem_object *bo, void panfrost_gem_mapping_put(struct panfrost_gem_mapping *mapping); void panfrost_gem_teardown_mappings_locked(struct panfrost_gem_object *bo); -void panfrost_gem_shrinker_init(struct drm_device *dev); +int panfrost_gem_shrinker_init(struct drm_device *dev); void panfrost_gem_shrinker_cleanup(struct drm_device *dev); #endif /* __PANFROST_GEM_H__ */ diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c index 6a71a2555f85..3d9f51bd48b6 100644 --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c @@ -18,8 +18,7 @@ static unsigned long panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct panfrost_device *pfdev = - container_of(shrinker, struct panfrost_device, shrinker); + struct panfrost_device *pfdev = shrinker->private_data; struct drm_gem_shmem_object *shmem; unsigned long count = 0; @@ -65,8 +64,7 @@ unlock_mappings: static unsigned long panfrost_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct panfrost_device *pfdev = - container_of(shrinker, struct panfrost_device, shrinker); + struct panfrost_device *pfdev = shrinker->private_data; struct drm_gem_shmem_object *shmem, *tmp; unsigned long freed = 0; @@ -97,13 +95,21 @@ panfrost_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) * * This function registers and sets up the panfrost shrinker. */ -void panfrost_gem_shrinker_init(struct drm_device *dev) +int panfrost_gem_shrinker_init(struct drm_device *dev) { struct panfrost_device *pfdev = dev->dev_private; - pfdev->shrinker.count_objects = panfrost_gem_shrinker_count; - pfdev->shrinker.scan_objects = panfrost_gem_shrinker_scan; - pfdev->shrinker.seeks = DEFAULT_SEEKS; - WARN_ON(register_shrinker(&pfdev->shrinker, "drm-panfrost")); + + pfdev->shrinker = shrinker_alloc(0, "drm-panfrost"); + if (!pfdev->shrinker) + return -ENOMEM; + + pfdev->shrinker->count_objects = panfrost_gem_shrinker_count; + pfdev->shrinker->scan_objects = panfrost_gem_shrinker_scan; + pfdev->shrinker->private_data = pfdev; + + shrinker_register(pfdev->shrinker); + + return 0; } /** @@ -116,7 +122,6 @@ void panfrost_gem_shrinker_cleanup(struct drm_device *dev) { struct panfrost_device *pfdev = dev->dev_private; - if (pfdev->shrinker.nr_deferred) { - unregister_shrinker(&pfdev->shrinker); - } + if (pfdev->shrinker) + shrinker_free(pfdev->shrinker); } -- cgit v1.2.3 From 1f1d459c9a2f0c8618b137fe845c09e405ae59fc Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:22 +0800 Subject: dm: dynamically allocate the dm-bufio shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the dm-bufio shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct dm_bufio_client. Link: https://lkml.kernel.org/r/20230911094444.68966-24-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Abhinav Kumar Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/dm-bufio.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index bc309e41d074..62eb27639c9b 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -963,7 +963,7 @@ struct dm_bufio_client { sector_t start; - struct shrinker shrinker; + struct shrinker *shrinker; struct work_struct shrink_work; atomic_long_t need_shrink; @@ -2368,7 +2368,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink { struct dm_bufio_client *c; - c = container_of(shrink, struct dm_bufio_client, shrinker); + c = shrink->private_data; atomic_long_add(sc->nr_to_scan, &c->need_shrink); queue_work(dm_bufio_wq, &c->shrink_work); @@ -2377,7 +2377,7 @@ static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker); + struct dm_bufio_client *c = shrink->private_data; unsigned long count = cache_total(&c->cache); unsigned long retain_target = get_retain_buffers(c); unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink); @@ -2490,14 +2490,20 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign INIT_WORK(&c->shrink_work, shrink_work); atomic_long_set(&c->need_shrink, 0); - c->shrinker.count_objects = dm_bufio_shrink_count; - c->shrinker.scan_objects = dm_bufio_shrink_scan; - c->shrinker.seeks = 1; - c->shrinker.batch = 0; - r = register_shrinker(&c->shrinker, "dm-bufio:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (r) + c->shrinker = shrinker_alloc(0, "dm-bufio:(%u:%u)", + MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + if (!c->shrinker) { + r = -ENOMEM; goto bad; + } + + c->shrinker->count_objects = dm_bufio_shrink_count; + c->shrinker->scan_objects = dm_bufio_shrink_scan; + c->shrinker->seeks = 1; + c->shrinker->batch = 0; + c->shrinker->private_data = c; + + shrinker_register(c->shrinker); mutex_lock(&dm_bufio_clients_lock); dm_bufio_client_count++; @@ -2537,7 +2543,7 @@ void dm_bufio_client_destroy(struct dm_bufio_client *c) drop_buffers(c); - unregister_shrinker(&c->shrinker); + shrinker_free(c->shrinker); flush_work(&c->shrink_work); mutex_lock(&dm_bufio_clients_lock); -- cgit v1.2.3 From ba3d6acafd26416c3001adb4cc3cedcd6a419c73 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:23 +0800 Subject: dm zoned: dynamically allocate the dm-zoned-meta shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the dm-zoned-meta shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct dmz_metadata. Link: https://lkml.kernel.org/r/20230911094444.68966-25-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alasdair Kergon Cc: Mike Snitzer Cc: Abhinav Kumar Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/dm-zoned-metadata.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index 9d3cca8e3dc9..60a4dc01ea18 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -187,7 +187,7 @@ struct dmz_metadata { struct rb_root mblk_rbtree; struct list_head mblk_lru_list; struct list_head mblk_dirty_list; - struct shrinker mblk_shrinker; + struct shrinker *mblk_shrinker; /* Zone allocation management */ struct mutex map_lock; @@ -615,7 +615,7 @@ static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd, static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; return atomic_read(&zmd->nr_mblks); } @@ -626,7 +626,7 @@ static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink, static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker); + struct dmz_metadata *zmd = shrink->private_data; unsigned long count; spin_lock(&zmd->mblk_lock); @@ -2936,19 +2936,23 @@ int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev, */ zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16; zmd->max_nr_mblks = zmd->min_nr_mblks + 512; - zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count; - zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan; - zmd->mblk_shrinker.seeks = DEFAULT_SEEKS; /* Metadata cache shrinker */ - ret = register_shrinker(&zmd->mblk_shrinker, "dm-zoned-meta:(%u:%u)", - MAJOR(dev->bdev->bd_dev), - MINOR(dev->bdev->bd_dev)); - if (ret) { - dmz_zmd_err(zmd, "Register metadata cache shrinker failed"); + zmd->mblk_shrinker = shrinker_alloc(0, "dm-zoned-meta:(%u:%u)", + MAJOR(dev->bdev->bd_dev), + MINOR(dev->bdev->bd_dev)); + if (!zmd->mblk_shrinker) { + ret = -ENOMEM; + dmz_zmd_err(zmd, "Allocate metadata cache shrinker failed"); goto err; } + zmd->mblk_shrinker->count_objects = dmz_mblock_shrinker_count; + zmd->mblk_shrinker->scan_objects = dmz_mblock_shrinker_scan; + zmd->mblk_shrinker->private_data = zmd; + + shrinker_register(zmd->mblk_shrinker); + dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version); for (i = 0; i < zmd->nr_devs; i++) dmz_print_dev(zmd, i); @@ -2995,7 +2999,7 @@ err: */ void dmz_dtr_metadata(struct dmz_metadata *zmd) { - unregister_shrinker(&zmd->mblk_shrinker); + shrinker_free(zmd->mblk_shrinker); dmz_cleanup_metadata(zmd); kfree(zmd); } -- cgit v1.2.3 From 86298d8b8ceacc17d0192cd6412d2773ff51b27f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:24 +0800 Subject: md/raid5: dynamically allocate the md-raid5 shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the md-raid5 shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct r5conf. Link: https://lkml.kernel.org/r/20230911094444.68966-26-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Song Liu Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/raid5.c | 26 +++++++++++++++----------- drivers/md/raid5.h | 2 +- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4cb9c608ee19..c8d2c6e50aa1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7401,7 +7401,7 @@ static void free_conf(struct r5conf *conf) log_exit(conf); - unregister_shrinker(&conf->shrinker); + shrinker_free(conf->shrinker); free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); @@ -7449,7 +7449,7 @@ static int raid5_alloc_percpu(struct r5conf *conf) static unsigned long raid5_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + struct r5conf *conf = shrink->private_data; unsigned long ret = SHRINK_STOP; if (mutex_trylock(&conf->cache_size_mutex)) { @@ -7470,7 +7470,7 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink, static unsigned long raid5_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct r5conf *conf = container_of(shrink, struct r5conf, shrinker); + struct r5conf *conf = shrink->private_data; if (conf->max_nr_stripes < conf->min_nr_stripes) /* unlikely, but not impossible */ @@ -7705,18 +7705,22 @@ static struct r5conf *setup_conf(struct mddev *mddev) * it reduces the queue depth and so can hurt throughput. * So set it rather large, scaled by number of devices. */ - conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4; - conf->shrinker.scan_objects = raid5_cache_scan; - conf->shrinker.count_objects = raid5_cache_count; - conf->shrinker.batch = 128; - conf->shrinker.flags = 0; - ret = register_shrinker(&conf->shrinker, "md-raid5:%s", mdname(mddev)); - if (ret) { - pr_warn("md/raid:%s: couldn't register shrinker.\n", + conf->shrinker = shrinker_alloc(0, "md-raid5:%s", mdname(mddev)); + if (!conf->shrinker) { + ret = -ENOMEM; + pr_warn("md/raid:%s: couldn't allocate shrinker.\n", mdname(mddev)); goto abort; } + conf->shrinker->seeks = DEFAULT_SEEKS * conf->raid_disks * 4; + conf->shrinker->scan_objects = raid5_cache_scan; + conf->shrinker->count_objects = raid5_cache_count; + conf->shrinker->batch = 128; + conf->shrinker->private_data = conf; + + shrinker_register(conf->shrinker); + sprintf(pers_name, "raid%d", mddev->new_level); rcu_assign_pointer(conf->thread, md_register_thread(raid5d, mddev, pers_name)); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 97a795979a35..22bea20eccbd 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -670,7 +670,7 @@ struct r5conf { wait_queue_head_t wait_for_stripe; wait_queue_head_t wait_for_overlap; unsigned long cache_state; - struct shrinker shrinker; + struct shrinker *shrinker; int pool_size; /* number of disks in stripeheads in pool */ spinlock_t device_lock; struct disk_info *disks; -- cgit v1.2.3 From a6a1eb6214cf7adfd1e2da33d819df17fe8a64d7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:25 +0800 Subject: bcache: dynamically allocate the md-bcache shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the md-bcache shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct cache_set. Link: https://lkml.kernel.org/r/20230911094444.68966-27-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Muchun Song Cc: Coly Li Cc: Kent Overstreet Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/btree.c | 27 ++++++++++++++++----------- drivers/md/bcache/sysfs.c | 3 ++- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 5a79bb3c272f..c622bc50f81b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -541,7 +541,7 @@ struct cache_set { struct bio_set bio_split; /* For the btree cache */ - struct shrinker shrink; + struct shrinker *shrink; /* For the btree cache and anything allocation related */ struct mutex bucket_lock; diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index fd121a61f17c..ae5cbb55861f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -667,7 +667,7 @@ out_unlock: static unsigned long bch_mca_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; @@ -734,7 +734,7 @@ out: static unsigned long bch_mca_count(struct shrinker *shrink, struct shrink_control *sc) { - struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct cache_set *c = shrink->private_data; if (c->shrinker_disabled) return 0; @@ -752,8 +752,8 @@ void bch_btree_cache_free(struct cache_set *c) closure_init_stack(&cl); - if (c->shrink.list.next) - unregister_shrinker(&c->shrink); + if (c->shrink) + shrinker_free(c->shrink); mutex_lock(&c->bucket_lock); @@ -828,14 +828,19 @@ int bch_btree_cache_alloc(struct cache_set *c) c->verify_data = NULL; #endif - c->shrink.count_objects = bch_mca_count; - c->shrink.scan_objects = bch_mca_scan; - c->shrink.seeks = 4; - c->shrink.batch = c->btree_pages * 2; + c->shrink = shrinker_alloc(0, "md-bcache:%pU", c->set_uuid); + if (!c->shrink) { + pr_warn("bcache: %s: could not allocate shrinker\n", __func__); + return 0; + } + + c->shrink->count_objects = bch_mca_count; + c->shrink->scan_objects = bch_mca_scan; + c->shrink->seeks = 4; + c->shrink->batch = c->btree_pages * 2; + c->shrink->private_data = c; - if (register_shrinker(&c->shrink, "md-bcache:%pU", c->set_uuid)) - pr_warn("bcache: %s: could not register shrinker\n", - __func__); + shrinker_register(c->shrink); return 0; } diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 0e2c1880f60b..45d8af755de6 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -866,7 +866,8 @@ STORE(__bch_cache_set) sc.gfp_mask = GFP_KERNEL; sc.nr_to_scan = strtoul_or_return(buf); - c->shrink.scan_objects(&c->shrink, &sc); + if (c->shrink) + c->shrink->scan_objects(c->shrink, &sc); } sysfs_strtoul_clamp(congested_read_threshold_us, -- cgit v1.2.3 From 17c4eb036a37f079ce2857be5914f839623b4eb2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:26 +0800 Subject: vmw_balloon: dynamically allocate the vmw-balloon shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the vmw-balloon shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct vmballoon. And we can simply exit vmballoon_init() when registering the shrinker fails. So the shrinker_registered indication is redundant, just remove it. Link: https://lkml.kernel.org/r/20230911094444.68966-28-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Nadav Amit Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/misc/vmw_balloon.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) diff --git a/drivers/misc/vmw_balloon.c b/drivers/misc/vmw_balloon.c index 9ce9b9e0e9b6..c817d8c21641 100644 --- a/drivers/misc/vmw_balloon.c +++ b/drivers/misc/vmw_balloon.c @@ -380,16 +380,7 @@ struct vmballoon { /** * @shrinker: shrinker interface that is used to avoid over-inflation. */ - struct shrinker shrinker; - - /** - * @shrinker_registered: whether the shrinker was registered. - * - * The shrinker interface does not handle gracefully the removal of - * shrinker that was not registered before. This indication allows to - * simplify the unregistration process. - */ - bool shrinker_registered; + struct shrinker *shrinker; }; static struct vmballoon balloon; @@ -1568,29 +1559,27 @@ static unsigned long vmballoon_shrinker_count(struct shrinker *shrinker, static void vmballoon_unregister_shrinker(struct vmballoon *b) { - if (b->shrinker_registered) - unregister_shrinker(&b->shrinker); - b->shrinker_registered = false; + shrinker_free(b->shrinker); + b->shrinker = NULL; } static int vmballoon_register_shrinker(struct vmballoon *b) { - int r; - /* Do nothing if the shrinker is not enabled */ if (!vmwballoon_shrinker_enable) return 0; - b->shrinker.scan_objects = vmballoon_shrinker_scan; - b->shrinker.count_objects = vmballoon_shrinker_count; - b->shrinker.seeks = DEFAULT_SEEKS; + b->shrinker = shrinker_alloc(0, "vmw-balloon"); + if (!b->shrinker) + return -ENOMEM; - r = register_shrinker(&b->shrinker, "vmw-balloon"); + b->shrinker->scan_objects = vmballoon_shrinker_scan; + b->shrinker->count_objects = vmballoon_shrinker_count; + b->shrinker->private_data = b; - if (r == 0) - b->shrinker_registered = true; + shrinker_register(b->shrinker); - return r; + return 0; } /* @@ -1883,7 +1872,7 @@ static int __init vmballoon_init(void) error = vmballoon_register_shrinker(&balloon); if (error) - goto fail; + return error; /* * Initialization of compaction must be done after the call to @@ -1905,9 +1894,6 @@ static int __init vmballoon_init(void) vmballoon_debugfs_init(&balloon); return 0; -fail: - vmballoon_unregister_shrinker(&balloon); - return error; } /* -- cgit v1.2.3 From 0fbb99698b165444e3cf5421f96de01d9d56baf4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:27 +0800 Subject: virtio_balloon: dynamically allocate the virtio-balloon shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the virtio-balloon shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct virtio_balloon. Link: https://lkml.kernel.org/r/20230911094444.68966-29-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: "Michael S. Tsirkin" Cc: David Hildenbrand Cc: Jason Wang Cc: Xuan Zhuo Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/virtio/virtio_balloon.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 5b15936a5214..5cdc7b081baa 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -111,7 +111,7 @@ struct virtio_balloon { struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; /* Shrinker to return free pages - VIRTIO_BALLOON_F_FREE_PAGE_HINT */ - struct shrinker shrinker; + struct shrinker *shrinker; /* OOM notifier to deflate on OOM - VIRTIO_BALLOON_F_DEFLATE_ON_OOM */ struct notifier_block oom_nb; @@ -816,8 +816,7 @@ static unsigned long shrink_free_pages(struct virtio_balloon *vb, static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + struct virtio_balloon *vb = shrinker->private_data; return shrink_free_pages(vb, sc->nr_to_scan); } @@ -825,8 +824,7 @@ static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker, static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { - struct virtio_balloon *vb = container_of(shrinker, - struct virtio_balloon, shrinker); + struct virtio_balloon *vb = shrinker->private_data; return vb->num_free_page_blocks * VIRTIO_BALLOON_HINT_BLOCK_PAGES; } @@ -847,16 +845,22 @@ static int virtio_balloon_oom_notify(struct notifier_block *nb, static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb) { - unregister_shrinker(&vb->shrinker); + shrinker_free(vb->shrinker); } static int virtio_balloon_register_shrinker(struct virtio_balloon *vb) { - vb->shrinker.scan_objects = virtio_balloon_shrinker_scan; - vb->shrinker.count_objects = virtio_balloon_shrinker_count; - vb->shrinker.seeks = DEFAULT_SEEKS; + vb->shrinker = shrinker_alloc(0, "virtio-balloon"); + if (!vb->shrinker) + return -ENOMEM; - return register_shrinker(&vb->shrinker, "virtio-balloon"); + vb->shrinker->scan_objects = virtio_balloon_shrinker_scan; + vb->shrinker->count_objects = virtio_balloon_shrinker_count; + vb->shrinker->private_data = vb; + + shrinker_register(vb->shrinker); + + return 0; } static int virtballoon_probe(struct virtio_device *vdev) -- cgit v1.2.3 From 714e5bde00a58ba13b03b68547931e011f93de58 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:28 +0800 Subject: mbcache: dynamically allocate the mbcache shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mbcache shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct mb_cache. Link: https://lkml.kernel.org/r/20230911094444.68966-30-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/mbcache.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/mbcache.c b/fs/mbcache.c index 2a4b8b549e93..82aa7a35db26 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -37,7 +37,7 @@ struct mb_cache { struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; - struct shrinker c_shrink; + struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; @@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } @@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } @@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits) for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - cache->c_shrink.count_objects = mb_cache_count; - cache->c_shrink.scan_objects = mb_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { + cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); + if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } + cache->c_shrink->count_objects = mb_cache_count; + cache->c_shrink->scan_objects = mb_cache_scan; + cache->c_shrink->private_data = cache; + + shrinker_register(cache->c_shrink); + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; @@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; - unregister_shrinker(&cache->c_shrink); + shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this -- cgit v1.2.3 From 4d09d75d8b8cb3c8087b46585924777bc8737663 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:29 +0800 Subject: ext4: dynamically allocate the ext4-es shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the ext4-es shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct ext4_sb_info. Link: https://lkml.kernel.org/r/20230911094444.68966-31-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/ext4/ext4.h | 2 +- fs/ext4/extents_status.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9418359b1d9d..8eeff770992c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1653,7 +1653,7 @@ struct ext4_sb_info { __u32 s_csum_seed; /* Reclaim extents from extent status tree */ - struct shrinker s_es_shrinker; + struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 6f7de14c0fa8..deec7d1f4e50 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1606,7 +1606,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1615,8 +1615,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1700,13 +1699,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1726,7 +1729,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* -- cgit v1.2.3 From 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:30 +0800 Subject: jbd2,ext4: dynamically allocate the jbd2-journal shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the jbd2-journal shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct journal_s. Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: "Theodore Ts'o" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 29 ++++++++++++++++++----------- include/linux/jbd2.h | 2 +- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 30dec2bd2ecc..ed53188472f9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1290,7 +1290,7 @@ static int jbd2_min_tag_size(void) static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long nr_to_scan = sc->nr_to_scan; unsigned long nr_shrunk; unsigned long count; @@ -1316,7 +1316,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); @@ -1588,14 +1588,21 @@ static journal_t *journal_init_common(struct block_device *bdev, goto err_cleanup; journal->j_shrink_transaction = NULL; - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (err) + + journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!journal->j_shrinker) { + err = -ENOMEM; goto err_cleanup; + } + + journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker->count_objects = jbd2_journal_shrink_count; + journal->j_shrinker->batch = journal->j_max_transaction_buffers; + journal->j_shrinker->private_data = journal; + + shrinker_register(journal->j_shrinker); return journal; @@ -2172,9 +2179,9 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } - if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + if (journal->j_shrinker) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); - unregister_shrinker(&journal->j_shrinker); + shrinker_free(journal->j_shrinker); } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 52772c826c86..6dcbb4eb80fb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -886,7 +886,7 @@ struct journal_s * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ - struct shrinker j_shrinker; + struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: -- cgit v1.2.3 From d17452aa33a6fd0e9cf2bd3d43237f2d2c294a7e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:31 +0800 Subject: nfsd: dynamically allocate the nfsd-client shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-client shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. Link: https://lkml.kernel.org/r/20230911094444.68966-33-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Cc: Jeff Layton Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfs4state.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ec49b200b797..f669444d5336 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -195,7 +195,7 @@ struct nfsd_net { int nfs4_max_clients; atomic_t nfsd_courtesy_clients; - struct shrinker nfsd_client_shrinker; + struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8534693eb6a4..23b3b38c8cda 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4400,8 +4400,7 @@ static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_client_shrinker); + struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -8149,12 +8148,16 @@ static int nfs4_state_create_net(struct net *net) INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - - if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); + if (!nn->nfsd_client_shrinker) goto err_shrinker; + + nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_client_shrinker); + return 0; err_shrinker: @@ -8252,7 +8255,7 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_shrinker(&nn->nfsd_client_shrinker); + shrinker_free(nn->nfsd_client_shrinker); cancel_work(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); -- cgit v1.2.3 From 8eea99a81c6f67c08fedd7db2ccd09aa93db69a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:32 +0800 Subject: nfsd: dynamically allocate the nfsd-reply shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-reply shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. Link: https://lkml.kernel.org/r/20230911094444.68966-34-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Acked-by: Muchun Song Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfscache.c | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index f669444d5336..ab303a8b77d5 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -177,7 +177,7 @@ struct nfsd_net { /* size of cache when we saw the longest hash chain */ unsigned int longest_chain_cachesize; - struct shrinker nfsd_reply_cache_shrinker; + struct shrinker *nfsd_reply_cache_shrinker; /* tracking server-to-server copy mounts */ spinlock_t nfsd_ssc_lock; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80621a709510..fd56a52aa5fb 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -201,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; unsigned int i; - int status = 0; nn->max_drc_entries = nfsd_cache_size_limit(); atomic_set(&nn->num_drc_entries, 0); hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; - nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; - nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker, - "nfsd-reply:%s", nn->nfsd_name); - if (status) - return status; - nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); if (!nn->drc_hashtbl) + return -ENOMEM; + + nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s", + nn->nfsd_name); + if (!nn->nfsd_reply_cache_shrinker) goto out_shrinker; + nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker->seeks = 1; + nn->nfsd_reply_cache_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_reply_cache_shrinker); + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); spin_lock_init(&nn->drc_hashtbl[i].cache_lock); @@ -229,7 +232,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + kvfree(nn->drc_hashtbl); printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -239,7 +242,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct nfsd_cacherep *rp; unsigned int i; - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + shrinker_free(nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; @@ -323,8 +326,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; return atomic_read(&nn->num_drc_entries); } @@ -343,8 +345,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; unsigned long freed = 0; LIST_HEAD(dispose); unsigned int i; -- cgit v1.2.3 From 17e7a00e60c8fff31082d5216b5c7d06d156f972 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:33 +0800 Subject: xfs: dynamically allocate the xfs-buf shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-buf shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_buftarg. Link: https://lkml.kernel.org/r/20230911094444.68966-35-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_buf.c | 24 +++++++++++++----------- fs/xfs/xfs_buf.h | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c1ece4a08ff4..9e7ba04572db 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; @@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } @@ -1947,7 +1945,7 @@ xfs_free_buftarg( { struct block_device *bdev = btp->bt_bdev; - unregister_shrinker(&btp->bt_shrinker); + shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); @@ -2031,13 +2029,17 @@ xfs_alloc_buftarg( if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) goto error_lru; - btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker.seeks = DEFAULT_SEEKS; - btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", - mp->m_super->s_id)) + btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", + mp->m_super->s_id); + if (!btp->bt_shrinker) goto error_pcpu; + + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + + shrinker_register(btp->bt_shrinker); + return btp; error_pcpu: diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index df8f47953bb4..702e7d9ea2ac 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -108,7 +108,7 @@ typedef struct xfs_buftarg { size_t bt_logical_sectormask; /* LRU control structures */ - struct shrinker bt_shrinker; + struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_io_count; -- cgit v1.2.3 From 1a86a53da46778a583cd866465e18d662d866f30 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:34 +0800 Subject: xfs: dynamically allocate the xfs-inodegc shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-inodegc shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_mount. Link: https://lkml.kernel.org/r/20230911094444.68966-36-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_icache.c | 26 +++++++++++++++----------- fs/xfs/xfs_mount.c | 4 ++-- fs/xfs/xfs_mount.h | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3c210ac83713..dba514a2c84d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2165,8 +2165,7 @@ xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; @@ -2187,8 +2186,7 @@ xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; bool no_items = true; @@ -2224,13 +2222,19 @@ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { - struct shrinker *shrink = &mp->m_inodegc_shrinker; + mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, + "xfs-inodegc:%s", + mp->m_super->s_id); + if (!mp->m_inodegc_shrinker) + return -ENOMEM; + + mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; + mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; + mp->m_inodegc_shrinker->seeks = 0; + mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->private_data = mp; - shrink->count_objects = xfs_inodegc_shrinker_count; - shrink->scan_objects = xfs_inodegc_shrinker_scan; - shrink->seeks = 0; - shrink->flags = SHRINKER_NONSLAB; - shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + shrinker_register(mp->m_inodegc_shrinker); - return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); + return 0; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0a0fd19573d8..aed5be5508fe 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1021,7 +1021,7 @@ xfs_mountfs( out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); @@ -1104,7 +1104,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d19cca099bc3..219681d29fbc 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,7 @@ typedef struct xfs_mount { atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ - struct shrinker m_inodegc_shrinker; + struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. -- cgit v1.2.3 From fe88527852be8a6f1391c961db38d342cc46461d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:35 +0800 Subject: xfs: dynamically allocate the xfs-qm shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-qm shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_quotainfo. Link: https://lkml.kernel.org/r/20230911094444.68966-37-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_qm.c | 27 ++++++++++++++------------- fs/xfs/xfs_qm.h | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 086e78a6143a..94a7932ac570 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -504,8 +504,7 @@ xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; struct xfs_qm_isolate isol; unsigned long freed; int error; @@ -539,8 +538,7 @@ xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; return list_lru_shrink_count(&qi->qi_lru, sc); } @@ -680,15 +678,18 @@ xfs_qm_init_quotainfo( if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); - qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; - qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; - qinf->qi_shrinker.seeks = DEFAULT_SEEKS; - qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - - error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", - mp->m_super->s_id); - if (error) + qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s", + mp->m_super->s_id); + if (!qinf->qi_shrinker) { + error = -ENOMEM; goto out_free_inos; + } + + qinf->qi_shrinker->count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker->private_data = qinf; + + shrinker_register(qinf->qi_shrinker); return 0; @@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - unregister_shrinker(&qi->qi_shrinker); + shrinker_free(qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9683f0457d19..d5c9fc4ba591 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -63,7 +63,7 @@ struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker *qi_shrinker; /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; -- cgit v1.2.3 From c19b548b493455cfb80895074687152869e77742 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:36 +0800 Subject: zsmalloc: dynamically allocate the mm-zspool shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mm-zspool shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct zs_pool. Link: https://lkml.kernel.org/r/20230911094444.68966-38-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b58f957429f0..c743ce7a5f49 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -229,7 +229,7 @@ struct zs_pool { struct zs_pool_stats stats; /* Compact classes */ - struct shrinker shrinker; + struct shrinker *shrinker; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; @@ -2086,8 +2086,7 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long pages_freed; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; /* * Compact classes and calculate compaction delta. @@ -2105,8 +2104,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, int i; struct size_class *class; unsigned long pages_to_free = 0; - struct zs_pool *pool = container_of(shrinker, struct zs_pool, - shrinker); + struct zs_pool *pool = shrinker->private_data; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; @@ -2121,18 +2119,23 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - unregister_shrinker(&pool->shrinker); + shrinker_free(pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) { - pool->shrinker.scan_objects = zs_shrinker_scan; - pool->shrinker.count_objects = zs_shrinker_count; - pool->shrinker.batch = 0; - pool->shrinker.seeks = DEFAULT_SEEKS; + pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); + if (!pool->shrinker) + return -ENOMEM; + + pool->shrinker->scan_objects = zs_shrinker_scan; + pool->shrinker->count_objects = zs_shrinker_count; + pool->shrinker->batch = 0; + pool->shrinker->private_data = pool; - return register_shrinker(&pool->shrinker, "mm-zspool:%s", - pool->name); + shrinker_register(pool->shrinker); + + return 0; } static int calculate_zspage_chain_size(int class_size) -- cgit v1.2.3 From 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:37 +0800 Subject: fs: super: dynamically allocate the s_shrink In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the s_shrink, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct super_block. Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/btrfs/super.c | 2 +- fs/kernfs/mount.c | 2 +- fs/proc/root.c | 2 +- fs/super.c | 33 ++++++++++++++++++--------------- include/linux/fs.h | 2 +- 5 files changed, 22 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1a093ec0f7e3..b1798bed68f2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1519,7 +1519,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c4bf26142eec..79b96e74a8a0 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -265,7 +265,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ - sb->s_shrink.seeks = 0; + sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9191248f2dac..b55dbc70287b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; + s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); diff --git a/fs/super.c b/fs/super.c index 2d762ce67f6e..adadf6689611 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, long dentries; long inodes; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want @@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, struct super_block *sb; long total_objects = 0; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability @@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s) security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - free_prealloced_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.scan_objects = super_cache_scan; - s->s_shrink.count_objects = super_cache_count; - s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, + "sb-%s", type->name); + if (!s->s_shrink) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) + + s->s_shrink->scan_objects = super_cache_scan; + s->s_shrink->count_objects = super_cache_count; + s->s_shrink->batch = 1024; + s->s_shrink->private_data = s; + + if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) + if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; @@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - unregister_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); @@ -818,7 +821,7 @@ retry: hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; share_extant_sb: @@ -901,7 +904,7 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); @@ -1522,7 +1525,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); - shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, + shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index b528f063e8ff..fd539c9fef8e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1265,7 +1265,7 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - struct shrinker s_shrink; /* per-sb shrinker handle */ + struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; -- cgit v1.2.3 From f2383e01507eeee8a1c1283d61a117a97d6c4ebe Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:38 +0800 Subject: mm: shrinker: remove old APIs Now no users are using the old APIs, just remove them. Link: https://lkml.kernel.org/r/20230911094444.68966-40-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 8 --- mm/shrinker.c | 143 ----------------------------------------------- 2 files changed, 151 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index f4a5249f00b2..1d3899f37229 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -105,14 +105,6 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); -extern int __printf(2, 3) prealloc_shrinker(struct shrinker *shrinker, - const char *fmt, ...); -extern void register_shrinker_prepared(struct shrinker *shrinker); -extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, - const char *fmt, ...); -extern void unregister_shrinker(struct shrinker *shrinker); -extern void free_prealloced_shrinker(struct shrinker *shrinker); - #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index d1032a4d5684..736fa67e8454 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -655,146 +655,3 @@ void shrinker_free(struct shrinker *shrinker) kfree(shrinker); } EXPORT_SYMBOL_GPL(shrinker_free); - -/* - * Add a shrinker callback to be called from the vm. - */ -static int __prealloc_shrinker(struct shrinker *shrinker) -{ - unsigned int size; - int err; - - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); - if (err != -ENOSYS) - return err; - - shrinker->flags &= ~SHRINKER_MEMCG_AWARE; - } - - size = sizeof(*shrinker->nr_deferred); - if (shrinker->flags & SHRINKER_NUMA_AWARE) - size *= nr_node_ids; - - shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); - if (!shrinker->nr_deferred) - return -ENOMEM; - - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __prealloc_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - - return err; -} -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif - -void free_prealloced_shrinker(struct shrinker *shrinker) -{ -#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; -#endif - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); - up_write(&shrinker_rwsem); - return; - } - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} - -void register_shrinker_prepared(struct shrinker *shrinker) -{ - down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); - shrinker->flags |= SHRINKER_REGISTERED; - shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); -} - -static int __register_shrinker(struct shrinker *shrinker) -{ - int err = __prealloc_shrinker(shrinker); - - if (err) - return err; - register_shrinker_prepared(shrinker); - return 0; -} - -#ifdef CONFIG_SHRINKER_DEBUG -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - va_list ap; - int err; - - va_start(ap, fmt); - shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); - va_end(ap); - if (!shrinker->name) - return -ENOMEM; - - err = __register_shrinker(shrinker); - if (err) { - kfree_const(shrinker->name); - shrinker->name = NULL; - } - return err; -} -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __register_shrinker(shrinker); -} -#endif -EXPORT_SYMBOL(register_shrinker); - -/* - * Remove one - */ -void unregister_shrinker(struct shrinker *shrinker) -{ - struct dentry *debugfs_entry; - int debugfs_id; - - if (!(shrinker->flags & SHRINKER_REGISTERED)) - return; - - down_write(&shrinker_rwsem); - list_del(&shrinker->list); - shrinker->flags &= ~SHRINKER_REGISTERED; - if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); - debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - up_write(&shrinker_rwsem); - - shrinker_debugfs_remove(debugfs_entry, debugfs_id); - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -} -EXPORT_SYMBOL(unregister_shrinker); -- cgit v1.2.3 From 307bececcd1205bcb67a3c0d53a69db237ccc9d4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:39 +0800 Subject: mm: shrinker: add a secondary array for shrinker_info::{map, nr_deferred} Currently, we maintain two linear arrays per node per memcg, which are shrinker_info::map and shrinker_info::nr_deferred. And we need to resize them when the shrinker_nr_max is exceeded, that is, allocate a new array, and then copy the old array to the new array, and finally free the old array by RCU. For shrinker_info::map, we do set_bit() under the RCU lock, so we may set the value into the old map which is about to be freed. This may cause the value set to be lost. The current solution is not to copy the old map when resizing, but to set all the corresponding bits in the new map to 1. This solves the data loss problem, but bring the overhead of more pointless loops while doing memcg slab shrink. For shrinker_info::nr_deferred, we will only modify it under the read lock of shrinker_rwsem, so it will not run concurrently with the resizing. But after we make memcg slab shrink lockless, there will be the same data loss problem as shrinker_info::map, and we can't work around it like the map. For such resizable arrays, the most straightforward idea is to change it to xarray, like we did for list_lru [1]. We need to do xa_store() in the list_lru_add()-->set_shrinker_bit(), but this will cause memory allocation, and the list_lru_add() doesn't accept failure. A possible solution is to pre-allocate, but the location of pre-allocation is not well determined (such as deferred_split_shrinker case). Therefore, this commit chooses to introduce the following secondary array for shrinker_info::{map, nr_deferred}: +---------------+--------+--------+-----+ | shrinker_info | unit 0 | unit 1 | ... | (secondary array) +---------------+--------+--------+-----+ | v +---------------+-----+ | nr_deferred[] | map | (leaf array) +---------------+-----+ (shrinker_info_unit) The leaf array is never freed unless the memcg is destroyed. The secondary array will be resized every time the shrinker id exceeds shrinker_nr_max. So the shrinker_info_unit can be indexed from both the old and the new shrinker_info->unit[x]. Then even if we get the old secondary array under the RCU lock, the found map and nr_deferred are also true, so the updated nr_deferred and map will not be lost. [1]. https://lore.kernel.org/all/20220228122126.37293-13-songmuchun@bytedance.com/ [zhengqi.arch@bytedance.com: unlock the &shrinker_rwsem before the call to free_shrinker_info()] Link: https://lkml.kernel.org/r/20230928141517.12164-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20230911094444.68966-41-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 12 +-- include/linux/shrinker.h | 17 +++ mm/shrinker.c | 250 ++++++++++++++++++++++++++++----------------- 3 files changed, 172 insertions(+), 107 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4e24da16d2c..031102ac9311 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,7 @@ #include #include #include +#include struct mem_cgroup; struct obj_cgroup; @@ -88,17 +89,6 @@ struct mem_cgroup_reclaim_iter { unsigned int generation; }; -/* - * Bitmap and deferred work of shrinker::id corresponding to memcg-aware - * shrinkers, which have elements charged to this memcg. - */ -struct shrinker_info { - struct rcu_head rcu; - atomic_long_t *nr_deferred; - unsigned long *map; - int map_nr_max; -}; - struct lruvec_stats_percpu { /* Local (CPU and cgroup) state */ long state[NR_VM_NODE_STAT_ITEMS]; diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 1d3899f37229..ba5ac82b5dbd 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -5,6 +5,23 @@ #include #include +#define SHRINKER_UNIT_BITS BITS_PER_LONG + +/* + * Bitmap and deferred work of shrinker::id corresponding to memcg-aware + * shrinkers, which have elements charged to the memcg. + */ +struct shrinker_info_unit { + atomic_long_t nr_deferred[SHRINKER_UNIT_BITS]; + DECLARE_BITMAP(map, SHRINKER_UNIT_BITS); +}; + +struct shrinker_info { + struct rcu_head rcu; + int map_nr_max; + struct shrinker_info_unit *unit[]; +}; + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. diff --git a/mm/shrinker.c b/mm/shrinker.c index 736fa67e8454..e9644cda80b5 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -12,15 +12,50 @@ DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_MEMCG static int shrinker_nr_max; -/* The shrinker_info is expanded in a batch of BITS_PER_LONG */ -static inline int shrinker_map_size(int nr_items) +static inline int shrinker_unit_size(int nr_items) { - return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long)); + return (DIV_ROUND_UP(nr_items, SHRINKER_UNIT_BITS) * sizeof(struct shrinker_info_unit *)); } -static inline int shrinker_defer_size(int nr_items) +static inline void shrinker_unit_free(struct shrinker_info *info, int start) { - return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t)); + struct shrinker_info_unit **unit; + int nr, i; + + if (!info) + return; + + unit = info->unit; + nr = DIV_ROUND_UP(info->map_nr_max, SHRINKER_UNIT_BITS); + + for (i = start; i < nr; i++) { + if (!unit[i]) + break; + + kfree(unit[i]); + unit[i] = NULL; + } +} + +static inline int shrinker_unit_alloc(struct shrinker_info *new, + struct shrinker_info *old, int nid) +{ + struct shrinker_info_unit *unit; + int nr = DIV_ROUND_UP(new->map_nr_max, SHRINKER_UNIT_BITS); + int start = old ? DIV_ROUND_UP(old->map_nr_max, SHRINKER_UNIT_BITS) : 0; + int i; + + for (i = start; i < nr; i++) { + unit = kzalloc_node(sizeof(*unit), GFP_KERNEL, nid); + if (!unit) { + shrinker_unit_free(new, start); + return -ENOMEM; + } + + new->unit[i] = unit; + } + + return 0; } void free_shrinker_info(struct mem_cgroup *memcg) @@ -32,6 +67,7 @@ void free_shrinker_info(struct mem_cgroup *memcg) for_each_node(nid) { pn = memcg->nodeinfo[nid]; info = rcu_dereference_protected(pn->shrinker_info, true); + shrinker_unit_free(info, 0); kvfree(info); rcu_assign_pointer(pn->shrinker_info, NULL); } @@ -40,28 +76,28 @@ void free_shrinker_info(struct mem_cgroup *memcg) int alloc_shrinker_info(struct mem_cgroup *memcg) { struct shrinker_info *info; - int nid, size, ret = 0; - int map_size, defer_size = 0; + int nid, ret = 0; + int array_size = 0; down_write(&shrinker_rwsem); - map_size = shrinker_map_size(shrinker_nr_max); - defer_size = shrinker_defer_size(shrinker_nr_max); - size = map_size + defer_size; + array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { - info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid); - if (!info) { - free_shrinker_info(memcg); - ret = -ENOMEM; - break; - } - info->nr_deferred = (atomic_long_t *)(info + 1); - info->map = (void *)info->nr_deferred + defer_size; + info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); + if (!info) + goto err; info->map_nr_max = shrinker_nr_max; + if (shrinker_unit_alloc(info, NULL, nid)) + goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } up_write(&shrinker_rwsem); return ret; + +err: + up_write(&shrinker_rwsem); + free_shrinker_info(memcg); + return -ENOMEM; } static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, @@ -71,15 +107,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, lockdep_is_held(&shrinker_rwsem)); } -static int expand_one_shrinker_info(struct mem_cgroup *memcg, - int map_size, int defer_size, - int old_map_size, int old_defer_size, - int new_nr_max) +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, + int old_size, int new_nr_max) { struct shrinker_info *new, *old; struct mem_cgroup_per_node *pn; int nid; - int size = map_size + defer_size; for_each_node(nid) { pn = memcg->nodeinfo[nid]; @@ -92,21 +125,17 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, if (new_nr_max <= old->map_nr_max) continue; - new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid); + new = kvmalloc_node(sizeof(*new) + new_size, GFP_KERNEL, nid); if (!new) return -ENOMEM; - new->nr_deferred = (atomic_long_t *)(new + 1); - new->map = (void *)new->nr_deferred + defer_size; new->map_nr_max = new_nr_max; - /* map: set all old bits, clear all new bits */ - memset(new->map, (int)0xff, old_map_size); - memset((void *)new->map + old_map_size, 0, map_size - old_map_size); - /* nr_deferred: copy old values, clear all new values */ - memcpy(new->nr_deferred, old->nr_deferred, old_defer_size); - memset((void *)new->nr_deferred + old_defer_size, 0, - defer_size - old_defer_size); + memcpy(new->unit, old->unit, old_size); + if (shrinker_unit_alloc(new, old, nid)) { + kvfree(new); + return -ENOMEM; + } rcu_assign_pointer(pn->shrinker_info, new); kvfree_rcu(old, rcu); @@ -118,9 +147,8 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, static int expand_shrinker_info(int new_id) { int ret = 0; - int new_nr_max = round_up(new_id + 1, BITS_PER_LONG); - int map_size, defer_size = 0; - int old_map_size, old_defer_size = 0; + int new_nr_max = round_up(new_id + 1, SHRINKER_UNIT_BITS); + int new_size, old_size = 0; struct mem_cgroup *memcg; if (!root_mem_cgroup) @@ -128,15 +156,12 @@ static int expand_shrinker_info(int new_id) lockdep_assert_held(&shrinker_rwsem); - map_size = shrinker_map_size(new_nr_max); - defer_size = shrinker_defer_size(new_nr_max); - old_map_size = shrinker_map_size(shrinker_nr_max); - old_defer_size = shrinker_defer_size(shrinker_nr_max); + new_size = shrinker_unit_size(new_nr_max); + old_size = shrinker_unit_size(shrinker_nr_max); memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - ret = expand_one_shrinker_info(memcg, map_size, defer_size, - old_map_size, old_defer_size, + ret = expand_one_shrinker_info(memcg, new_size, old_size, new_nr_max); if (ret) { mem_cgroup_iter_break(NULL, memcg); @@ -150,17 +175,34 @@ out: return ret; } +static inline int shrinker_id_to_index(int shrinker_id) +{ + return shrinker_id / SHRINKER_UNIT_BITS; +} + +static inline int shrinker_id_to_offset(int shrinker_id) +{ + return shrinker_id % SHRINKER_UNIT_BITS; +} + +static inline int calc_shrinker_id(int index, int offset) +{ + return index * SHRINKER_UNIT_BITS + offset; +} + void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; + struct shrinker_info_unit *unit; rcu_read_lock(); info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); + unit = info->unit[shrinker_id_to_index(shrinker_id)]; if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); - set_bit(shrinker_id, info->map); + set_bit(shrinker_id_to_offset(shrinker_id), unit->map); } rcu_read_unlock(); } @@ -209,26 +251,31 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, struct mem_cgroup *memcg) { struct shrinker_info *info; + struct shrinker_info_unit *unit; info = shrinker_info_protected(memcg, nid); - return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); + unit = info->unit[shrinker_id_to_index(shrinker->id)]; + return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); } void reparent_shrinker_deferred(struct mem_cgroup *memcg) { - int i, nid; + int nid, index, offset; long nr; struct mem_cgroup *parent; struct shrinker_info *child_info, *parent_info; + struct shrinker_info_unit *child_unit, *parent_unit; parent = parent_mem_cgroup(memcg); if (!parent) @@ -239,9 +286,13 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); - for (i = 0; i < child_info->map_nr_max; i++) { - nr = atomic_long_read(&child_info->nr_deferred[i]); - atomic_long_add(nr, &parent_info->nr_deferred[i]); + for (index = 0; index < shrinker_id_to_index(child_info->map_nr_max); index++) { + child_unit = child_info->unit[index]; + parent_unit = parent_info->unit[index]; + for (offset = 0; offset < SHRINKER_UNIT_BITS; offset++) { + nr = atomic_long_read(&child_unit->nr_deferred[offset]); + atomic_long_add(nr, &parent_unit->nr_deferred[offset]); + } } } up_read(&shrinker_rwsem); @@ -407,7 +458,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int i; + int offset, index = 0; if (!mem_cgroup_online(memcg)) return 0; @@ -419,56 +470,63 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (unlikely(!info)) goto unlock; - for_each_set_bit(i, info->map, info->map_nr_max) { - struct shrink_control sc = { - .gfp_mask = gfp_mask, - .nid = nid, - .memcg = memcg, - }; - struct shrinker *shrinker; + for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + struct shrinker_info_unit *unit; - shrinker = idr_find(&shrinker_idr, i); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(i, info->map); - continue; - } + unit = info->unit[index]; - /* Call non-slab shrinkers even though kmem is disabled */ - if (!memcg_kmem_online() && - !(shrinker->flags & SHRINKER_NONSLAB)) - continue; + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; + struct shrinker *shrinker; + int shrinker_id = calc_shrinker_id(index, offset); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) { - clear_bit(i, info->map); - /* - * After the shrinker reported that it had no objects to - * free, but before we cleared the corresponding bit in - * the memcg shrinker map, a new object might have been - * added. To make sure, we have the bit set in this - * case, we invoke the shrinker one more time and reset - * the bit if it reports that it is not empty anymore. - * The memory barrier here pairs with the barrier in - * set_shrinker_bit(): - * - * list_lru_add() shrink_slab_memcg() - * list_add_tail() clear_bit() - * - * set_bit() do_shrink_slab() - */ - smp_mb__after_atomic(); - ret = do_shrink_slab(&sc, shrinker, priority); - if (ret == SHRINK_EMPTY) - ret = 0; - else - set_shrinker_bit(memcg, nid, i); - } - freed += ret; + shrinker = idr_find(&shrinker_idr, shrinker_id); + if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { + if (!shrinker) + clear_bit(offset, unit->map); + continue; + } - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; + /* Call non-slab shrinkers even though kmem is disabled */ + if (!memcg_kmem_online() && + !(shrinker->flags & SHRINKER_NONSLAB)) + continue; + + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) { + clear_bit(offset, unit->map); + /* + * After the shrinker reported that it had no objects to + * free, but before we cleared the corresponding bit in + * the memcg shrinker map, a new object might have been + * added. To make sure, we have the bit set in this + * case, we invoke the shrinker one more time and reset + * the bit if it reports that it is not empty anymore. + * The memory barrier here pairs with the barrier in + * set_shrinker_bit(): + * + * list_lru_add() shrink_slab_memcg() + * list_add_tail() clear_bit() + * + * set_bit() do_shrink_slab() + */ + smp_mb__after_atomic(); + ret = do_shrink_slab(&sc, shrinker, priority); + if (ret == SHRINK_EMPTY) + ret = 0; + else + set_shrinker_bit(memcg, nid, shrinker_id); + } + freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + goto unlock; + } } } unlock: -- cgit v1.2.3 From 48a7a0996a0089f4161ad437a810a10596e8d278 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:40 +0800 Subject: mm: shrinker: rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}() With the new shrinker APIs, there is no action such as prealloc, so rename {prealloc|unregister}_memcg_shrinker() to shrinker_memcg_{alloc|remove}(), which corresponds to the idr_{alloc|remove}() inside the function. Link: https://lkml.kernel.org/r/20230911094444.68966-42-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index e9644cda80b5..e2e832b15803 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -210,7 +210,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) static DEFINE_IDR(shrinker_idr); -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { int id, ret = -ENOMEM; @@ -236,7 +236,7 @@ unlock: return ret; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { int id = shrinker->id; @@ -298,12 +298,12 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) up_read(&shrinker_rwsem); } #else -static int prealloc_memcg_shrinker(struct shrinker *shrinker) +static int shrinker_memcg_alloc(struct shrinker *shrinker) { return -ENOSYS; } -static void unregister_memcg_shrinker(struct shrinker *shrinker) +static void shrinker_memcg_remove(struct shrinker *shrinker) { } @@ -629,7 +629,7 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...) shrinker->seeks = DEFAULT_SEEKS; if (flags & SHRINKER_MEMCG_AWARE) { - err = prealloc_memcg_shrinker(shrinker); + err = shrinker_memcg_alloc(shrinker); if (err == -ENOSYS) { /* Memcg is not supported, fallback to non-memcg-aware shrinker. */ shrinker->flags &= ~SHRINKER_MEMCG_AWARE; @@ -701,7 +701,7 @@ void shrinker_free(struct shrinker *shrinker) shrinker_debugfs_name_free(shrinker); if (shrinker->flags & SHRINKER_MEMCG_AWARE) - unregister_memcg_shrinker(shrinker); + shrinker_memcg_remove(shrinker); up_write(&shrinker_rwsem); if (debugfs_entry) -- cgit v1.2.3 From ca1d36b823944f24b5755311e95883fb5fdb807b Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:41 +0800 Subject: mm: shrinker: make global slab shrink lockless The shrinker_rwsem is a global read-write lock in shrinkers subsystem, which protects most operations such as slab shrink, registration and unregistration of shrinkers, etc. This can easily cause problems in the following cases. 1) When the memory pressure is high and there are many filesystems mounted or unmounted at the same time, slab shrink will be affected (down_read_trylock() failed). Such as the real workload mentioned by Kirill Tkhai: ``` One of the real workloads from my experience is start of an overcommitted node containing many starting containers after node crash (or many resuming containers after reboot for kernel update). In these cases memory pressure is huge, and the node goes round in long reclaim. ``` 2) If a shrinker is blocked (such as the case mentioned in [1]) and a writer comes in (such as mount a fs), then this writer will be blocked and cause all subsequent shrinker-related operations to be blocked. Even if there is no competitor when shrinking slab, there may still be a problem. The down_read_trylock() may become a perf hotspot with frequent calls to shrink_slab(). Because of the poor multicore scalability of atomic operations, this can lead to a significant drop in IPC (instructions per cycle). We used to implement the lockless slab shrink with SRCU [2], but then kernel test robot reported -88.8% regression in stress-ng.ramfs.ops_per_sec test case [3], so we reverted it [4]. This commit uses the refcount+RCU method [5] proposed by Dave Chinner to re-implement the lockless global slab shrink. The memcg slab shrink is handled in the subsequent patch. For now, all shrinker instances are converted to dynamically allocated and will be freed by call_rcu(). So we can use rcu_read_{lock,unlock}() to ensure that the shrinker instance is valid. And the shrinker instance will not be run again after unregistration. So the structure that records the pointer of shrinker instance can be safely freed without waiting for the RCU read-side critical section. In this way, while we implement the lockless slab shrink, we don't need to be blocked in unregister_shrinker(). The following are the test results: stress-ng --timeout 60 --times --verify --metrics-brief --ramfs 9 & 1) Before applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 473062 60.00 8.00 279.13 7884.12 1647.59 for a 60.01s run time: 1440.34s available CPU time 7.99s user time ( 0.55%) 279.13s system time ( 19.38%) 287.12s total time ( 19.93%) load average: 7.12 2.99 1.15 successful run completed in 60.01s (1 min, 0.01 secs) 2) After applying this patchset: setting to a 60 second run per stressor dispatching hogs: 9 ramfs stressor bogo ops real time usr time sys time bogo ops/s bogo ops/s (secs) (secs) (secs) (real time) (usr+sys time) ramfs 477165 60.00 8.13 281.34 7952.55 1648.40 for a 60.01s run time: 1440.33s available CPU time 8.12s user time ( 0.56%) 281.34s system time ( 19.53%) 289.46s total time ( 20.10%) load average: 6.98 3.03 1.19 successful run completed in 60.01s (1 min, 0.01 secs) We can see that the ops/s has hardly changed. [1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/ [2]. https://lore.kernel.org/lkml/20230313112819.38938-1-zhengqi.arch@bytedance.com/ [3]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [4]. https://lore.kernel.org/all/20230609081518.3039120-1-qi.zheng@linux.dev/ [5]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230911094444.68966-43-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 24 +++++++++++++ mm/shrinker.c | 89 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 21 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index ba5ac82b5dbd..e4f93120e0ab 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -4,6 +4,8 @@ #include #include +#include +#include #define SHRINKER_UNIT_BITS BITS_PER_LONG @@ -87,6 +89,17 @@ struct shrinker { int seeks; /* seeks to recreate an obj */ unsigned flags; + /* + * The reference count of this shrinker. Registered shrinker have an + * initial refcount of 1, then the lookup operations are now allowed + * to use it via shrinker_try_get(). Later in the unregistration step, + * the initial refcount will be discarded, and will free the shrinker + * asynchronously via RCU after its refcount reaches 0. + */ + refcount_t refcount; + struct completion done; /* use to wait for refcount to reach 0 */ + struct rcu_head rcu; + void *private_data; /* These are for internal use */ @@ -122,6 +135,17 @@ struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); +static inline bool shrinker_try_get(struct shrinker *shrinker) +{ + return refcount_inc_not_zero(&shrinker->refcount); +} + +static inline void shrinker_put(struct shrinker *shrinker) +{ + if (refcount_dec_and_test(&shrinker->refcount)) + complete(&shrinker->done); +} + #ifdef CONFIG_SHRINKER_DEBUG extern int __printf(2, 3) shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...); diff --git a/mm/shrinker.c b/mm/shrinker.c index e2e832b15803..9bb2e61092b3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include "internal.h" @@ -577,33 +578,50 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - if (!down_read_trylock(&shrinker_rwsem)) - goto out; - - list_for_each_entry(shrinker, &shrinker_list, list) { + /* + * lockless algorithm of global shrink. + * + * In the unregistration setp, the shrinker will be freed asynchronously + * via RCU after its refcount reaches 0. So both rcu_read_lock() and + * shrinker_try_get() can be used to ensure the existence of the shrinker. + * + * So in the global shrink: + * step 1: use rcu_read_lock() to guarantee existence of the shrinker + * and the validity of the shrinker_list walk. + * step 2: use shrinker_try_get() to try get the refcount, if successful, + * then the existence of the shrinker can also be guaranteed, + * so we can release the RCU lock to do do_shrink_slab() that + * may sleep. + * step 3: *MUST* to reacquire the RCU lock before calling shrinker_put(), + * which ensures that neither this shrinker nor the next shrinker + * will be freed in the next traversal operation. + * step 4: do shrinker_put() paired with step 2 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + */ + rcu_read_lock(); + list_for_each_entry_rcu(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; + if (!shrinker_try_get(shrinker)) + continue; + + rcu_read_unlock(); + ret = do_shrink_slab(&sc, shrinker, priority); if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - /* - * Bail out if someone want to register a new shrinker to - * prevent the registration from being stalled for long periods - * by parallel ongoing shrinking. - */ - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - break; - } + + rcu_read_lock(); + shrinker_put(shrinker); } - up_read(&shrinker_rwsem); -out: + rcu_read_unlock(); cond_resched(); return freed; } @@ -676,13 +694,29 @@ void shrinker_register(struct shrinker *shrinker) } down_write(&shrinker_rwsem); - list_add_tail(&shrinker->list, &shrinker_list); + list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); + + init_completion(&shrinker->done); + /* + * Now the shrinker is fully set up, take the first reference to it to + * indicate that lookup operations are now allowed to use it via + * shrinker_try_get(). + */ + refcount_set(&shrinker->refcount, 1); } EXPORT_SYMBOL_GPL(shrinker_register); +static void shrinker_free_rcu_cb(struct rcu_head *head) +{ + struct shrinker *shrinker = container_of(head, struct shrinker, rcu); + + kfree(shrinker->nr_deferred); + kfree(shrinker); +} + void shrinker_free(struct shrinker *shrinker) { struct dentry *debugfs_entry = NULL; @@ -691,9 +725,25 @@ void shrinker_free(struct shrinker *shrinker) if (!shrinker) return; + if (shrinker->flags & SHRINKER_REGISTERED) { + /* drop the initial refcount */ + shrinker_put(shrinker); + /* + * Wait for all lookups of the shrinker to complete, after that, + * no shrinker is running or will run again, then we can safely + * free it asynchronously via RCU and safely free the structure + * where the shrinker is located, such as super_block etc. + */ + wait_for_completion(&shrinker->done); + } + down_write(&shrinker_rwsem); if (shrinker->flags & SHRINKER_REGISTERED) { - list_del(&shrinker->list); + /* + * Now we can safely remove it from the shrinker_list and then + * free it. + */ + list_del_rcu(&shrinker->list); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); shrinker->flags &= ~SHRINKER_REGISTERED; } @@ -707,9 +757,6 @@ void shrinker_free(struct shrinker *shrinker) if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; - - kfree(shrinker); + call_rcu(&shrinker->rcu, shrinker_free_rcu_cb); } EXPORT_SYMBOL_GPL(shrinker_free); -- cgit v1.2.3 From 50d09da8e1199092a128eebd8a986d1fb0335fe4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:42 +0800 Subject: mm: shrinker: make memcg slab shrink lockless Like global slab shrink, this commit also uses refcount+RCU method to make memcg slab shrink lockless. Use the following script to do slab shrink stress test: ``` DIR="/root/shrinker/memcg/mnt" do_create() { mkdir -p /sys/fs/cgroup/memory/test echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes for i in `seq 0 $1`; do mkdir -p /sys/fs/cgroup/memory/test/$i; echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; mkdir -p $DIR/$i; done } do_mount() { for i in `seq $1 $2`; do mount -t tmpfs $i $DIR/$i; done } do_touch() { for i in `seq $1 $2`; do echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & done } case "$1" in touch) do_touch $2 $3 ;; test) do_create 4000 do_mount 0 4000 do_touch 0 3000 ;; *) exit 1 ;; esac ``` Save the above script, then run test and touch commands. Then we can use the following perf command to view hotspots: perf top -U -F 999 1) Before applying this patchset: 33.15% [kernel] [k] down_read_trylock 25.38% [kernel] [k] shrink_slab 21.75% [kernel] [k] up_read 4.45% [kernel] [k] _find_next_bit 2.27% [kernel] [k] do_shrink_slab 1.80% [kernel] [k] intel_idle_irq 1.79% [kernel] [k] shrink_lruvec 0.67% [kernel] [k] xas_descend 0.41% [kernel] [k] mem_cgroup_iter 0.40% [kernel] [k] shrink_node 0.38% [kernel] [k] list_lru_count_one 2) After applying this patchset: 64.56% [kernel] [k] shrink_slab 12.18% [kernel] [k] do_shrink_slab 3.30% [kernel] [k] __rcu_read_unlock 2.61% [kernel] [k] shrink_lruvec 2.49% [kernel] [k] __rcu_read_lock 1.93% [kernel] [k] intel_idle_irq 0.89% [kernel] [k] shrink_node 0.81% [kernel] [k] mem_cgroup_iter 0.77% [kernel] [k] mem_cgroup_calculate_protection 0.66% [kernel] [k] list_lru_count_one We can see that the first perf hotspot becomes shrink_slab, which is what we expect. Link: https://lkml.kernel.org/r/20230911094444.68966-44-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 19 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index 9bb2e61092b3..d1b34a79c7a7 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -219,7 +219,6 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); - /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -253,10 +252,15 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + nr_deferred = atomic_long_xchg(&unit->nr_deferred[shrinker_id_to_offset(shrinker->id)], 0); + rcu_read_unlock(); + + return nr_deferred; } static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, @@ -264,10 +268,16 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; struct shrinker_info_unit *unit; + long nr_deferred; - info = shrinker_info_protected(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); unit = info->unit[shrinker_id_to_index(shrinker->id)]; - return atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + nr_deferred = + atomic_long_add_return(nr, &unit->nr_deferred[shrinker_id_to_offset(shrinker->id)]); + rcu_read_unlock(); + + return nr_deferred; } void reparent_shrinker_deferred(struct mem_cgroup *memcg) @@ -464,18 +474,54 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, if (!mem_cgroup_online(memcg)) return 0; - if (!down_read_trylock(&shrinker_rwsem)) - return 0; - - info = shrinker_info_protected(memcg, nid); + /* + * lockless algorithm of memcg shrink. + * + * The shrinker_info may be freed asynchronously via RCU in the + * expand_one_shrinker_info(), so the rcu_read_lock() needs to be used + * to ensure the existence of the shrinker_info. + * + * The shrinker_info_unit is never freed unless its corresponding memcg + * is destroyed. Here we already hold the refcount of memcg, so the + * memcg will not be destroyed, and of course shrinker_info_unit will + * not be freed. + * + * So in the memcg shrink: + * step 1: use rcu_read_lock() to guarantee existence of the + * shrinker_info. + * step 2: after getting shrinker_info_unit we can safely release the + * RCU lock. + * step 3: traverse the bitmap and calculate shrinker_id + * step 4: use rcu_read_lock() to guarantee existence of the shrinker. + * step 5: use shrinker_id to find the shrinker, then use + * shrinker_try_get() to guarantee existence of the shrinker, + * then we can release the RCU lock to do do_shrink_slab() that + * may sleep. + * step 6: do shrinker_put() paired with step 5 to put the refcount, + * if the refcount reaches 0, then wake up the waiter in + * shrinker_free() by calling complete(). + * Note: here is different from the global shrink, we don't + * need to acquire the RCU lock to guarantee existence of + * the shrinker, because we don't need to use this + * shrinker to traverse the next shrinker in the bitmap. + * step 7: we have already exited the read-side of rcu critical section + * before calling do_shrink_slab(), the shrinker_info may be + * released in expand_one_shrinker_info(), so go back to step 1 + * to reacquire the shrinker_info. + */ +again: + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (unlikely(!info)) goto unlock; - for (; index < shrinker_id_to_index(info->map_nr_max); index++) { + if (index < shrinker_id_to_index(info->map_nr_max)) { struct shrinker_info_unit *unit; unit = info->unit[index]; + rcu_read_unlock(); + for_each_set_bit(offset, unit->map, SHRINKER_UNIT_BITS) { struct shrink_control sc = { .gfp_mask = gfp_mask, @@ -485,12 +531,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, struct shrinker *shrinker; int shrinker_id = calc_shrinker_id(index, offset); + rcu_read_lock(); shrinker = idr_find(&shrinker_idr, shrinker_id); - if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) { - if (!shrinker) - clear_bit(offset, unit->map); + if (unlikely(!shrinker || !shrinker_try_get(shrinker))) { + clear_bit(offset, unit->map); + rcu_read_unlock(); continue; } + rcu_read_unlock(); /* Call non-slab shrinkers even though kmem is disabled */ if (!memcg_kmem_online() && @@ -523,15 +571,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, shrinker_id); } freed += ret; - - if (rwsem_is_contended(&shrinker_rwsem)) { - freed = freed ? : 1; - goto unlock; - } + shrinker_put(shrinker); } + + index++; + goto again; } unlock: - up_read(&shrinker_rwsem); + rcu_read_unlock(); return freed; } #else /* !CONFIG_MEMCG */ -- cgit v1.2.3 From 604b8b655020816b66ae474681d1100c5c0ba8c4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:43 +0800 Subject: mm: shrinker: hold write lock to reparent shrinker nr_deferred For now, reparent_shrinker_deferred() is the only holder of read lock of shrinker_rwsem. And it already holds the global cgroup_mutex, so it will not be called in parallel. Therefore, in order to convert shrinker_rwsem to shrinker_mutex later, here we change to hold the write lock of shrinker_rwsem to reparent. Link: https://lkml.kernel.org/r/20230911094444.68966-45-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- mm/shrinker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/shrinker.c b/mm/shrinker.c index d1b34a79c7a7..ef22f20a97cb 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_read(&shrinker_rwsem); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_read(&shrinker_rwsem); + up_write(&shrinker_rwsem); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) -- cgit v1.2.3 From 8a0e8bb112af28362514624fc46e20426b4bdf1f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:44 +0800 Subject: mm: shrinker: convert shrinker_rwsem to mutex Now there are no readers of shrinker_rwsem, so we can simply replace it with mutex lock. [akpm@linux-foundation.org: update the fix to alloc_shrinker_info()] Link: https://lkml.kernel.org/r/20230911094444.68966-46-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/dm-cache-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker.c | 30 +++++++++++++++--------------- mm/shrinker_debug.c | 14 +++++++------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index acffed750e3e..9e0c69958587 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_rwsem without holding cmd->root_lock + * - must take shrinker_mutex without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index adadf6689611..816a22a5cad1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag) * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker.c b/mm/shrinker.c index ef22f20a97cb..dd91eab43ed3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -8,7 +8,7 @@ #include "internal.h" LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); +DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -80,7 +80,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, ret = 0; int array_size = 0; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); @@ -91,12 +91,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; err: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } @@ -105,7 +105,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); + lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, @@ -155,7 +155,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); @@ -218,7 +218,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -232,7 +232,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } @@ -242,7 +242,7 @@ static void shrinker_memcg_remove(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) @@ -740,11 +740,11 @@ void shrinker_register(struct shrinker *shrinker) return; } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* @@ -784,7 +784,7 @@ void shrinker_free(struct shrinker *shrinker) wait_for_completion(&shrinker->done); } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then @@ -799,7 +799,7 @@ void shrinker_free(struct shrinker *shrinker) if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 24aebe7c24cc..12ea5486a3e9 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -9,7 +9,7 @@ #include "internal.h" /* defined in vmscan.c */ -extern struct rw_semaphore shrinker_rwsem; +extern struct mutex shrinker_mutex; extern struct list_head shrinker_list; static DEFINE_IDA(shrinker_debugfs_ida); @@ -165,7 +165,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -208,7 +208,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); old = shrinker->name; shrinker->name = new; @@ -226,7 +226,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); kfree_const(old); @@ -239,7 +239,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; @@ -265,14 +265,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } -- cgit v1.2.3 From ed547ab6f4041c2186df672029eb17f98f44d125 Mon Sep 17 00:00:00 2001 From: liujinlong Date: Tue, 12 Sep 2023 16:59:23 +0800 Subject: mm: vmscan: modify an easily misunderstood function name When looking at the code in the memory part, I found that the purpose of the function prepare_scan_countis very different from the function name. It is easy to misunderstand when reading.The function prepare_scan_count mainly completes the assignment of the scan_control structure.Therefore, I suggest that the function name can be changed to prepare_scan_control, which is easier to understand. Link: https://lkml.kernel.org/r/20230912085923.27238-1-liujinlong@kylinos.cn Signed-off-by: liujinlong Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4a6551510b65..8a3f83e0231e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2208,7 +2208,7 @@ enum scan_balance { SCAN_FILE, }; -static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) +static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; struct lruvec *target_lruvec; @@ -5834,7 +5834,7 @@ again: nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; - prepare_scan_count(pgdat, sc); + prepare_scan_control(pgdat, sc); shrink_node_memcgs(pgdat, sc); -- cgit v1.2.3 From 811244a501b967b00fecb1ae906d5dc6329c91e0 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 14 Sep 2023 00:49:37 +0800 Subject: mm: memcg: add THP swap out info for anonymous reclaim At present, we support per-memcg reclaim strategy, however we do not know the number of transparent huge pages being reclaimed, as we know the transparent huge pages need to be splited before reclaim them, and they will bring some performance bottleneck effect. for example, when two memcg (A & B) are doing reclaim for anonymous pages at same time, and 'A' memcg is reclaiming a large number of transparent huge pages, we can better analyze that the performance bottleneck will be caused by 'A' memcg. therefore, in order to better analyze such problems, there add THP swap out info for per-memcg. [akpm@linux-foundation.orgL fix swap_writepage_fs(), per Johannes] Link: https://lkml.kernel.org/r/20230913213343.GB48476@cmpxchg.org Link: https://lkml.kernel.org/r/20230913164938.16918-1-vernhao@tencent.com Signed-off-by: Xin Hao Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Roman Gushchin Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 9 +++++++++ mm/memcontrol.c | 2 ++ mm/page_io.c | 8 ++++---- mm/vmscan.c | 1 + 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b26b5274eaaf..622a7f28db1f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1532,6 +1532,15 @@ PAGE_SIZE multiple when read back. collapsing an existing range of pages. This counter is not present when CONFIG_TRANSPARENT_HUGEPAGE is not set. + thp_swpout (npn) + Number of transparent hugepages which are swapout in one piece + without splitting. + + thp_swpout_fallback (npn) + Number of transparent hugepages which were split before swapout. + Usually because failed to allocate some continuous swap space + for the huge page. + memory.numa_stat A read-only nested-keyed file which exists on non-root cgroups. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b009b233ab8..68313331971c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -704,6 +704,8 @@ static const unsigned int memcg_vm_event_stat[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE THP_FAULT_ALLOC, THP_COLLAPSE_ALLOC, + THP_SWPOUT, + THP_SWPOUT_FALLBACK, #endif }; diff --git a/mm/page_io.c b/mm/page_io.c index fe4c21af23f2..cb559ae324c6 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -208,8 +208,10 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (unlikely(folio_test_pmd_mappable(folio))) + if (unlikely(folio_test_pmd_mappable(folio))) { + count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); + } #endif count_vm_events(PSWPOUT, folio_nr_pages(folio)); } @@ -278,9 +280,6 @@ static void sio_write_complete(struct kiocb *iocb, long ret) set_page_dirty(page); ClearPageReclaim(page); } - } else { - for (p = 0; p < sio->pages; p++) - count_swpout_vm_event(page_folio(sio->bvec[p].bv_page)); } for (p = 0; p < sio->pages; p++) @@ -296,6 +295,7 @@ static void swap_writepage_fs(struct page *page, struct writeback_control *wbc) struct file *swap_file = sis->swap_file; loff_t pos = page_file_offset(page); + count_swpout_vm_event(page_folio(page)); set_page_writeback(page); unlock_page(page); if (wbc->swap_plug) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8a3f83e0231e..acf115468bf8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1214,6 +1214,7 @@ retry: folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE + count_memcg_folio_events(folio, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); #endif if (!add_to_swap(folio)) -- cgit v1.2.3 From fd63908706f79c963946a77b7f352db5431deed5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:08 +0200 Subject: mm/rmap: drop stale comment in page_add_anon_rmap and hugepage_add_anon_rmap() Patch series "Anon rmap cleanups". Some cleanups around rmap for anon pages. I'm working on more cleanups also around file rmap -- also to handle the "compound" parameter internally only and to let hugetlb use page_add_file_rmap(), but these changes make sense separately. This patch (of 6): That comment was added in commit 5dbe0af47f8a ("mm: fix kernel BUG at mm/rmap.c:1017!") to document why we can see vma->vm_end getting adjusted concurrently due to a VMA split. However, the optimized locking code was changed again in bf181b9f9d8 ("mm anon rmap: replace same_anon_vma linked list with an interval tree."). ... and later, the comment was changed in commit 0503ea8f5ba7 ("mm/mmap: remove __vma_adjust()") to talk about "vma_merge" although the original issue was with VMA splitting. Let's just remove that comment. Nowadays, it's outdated, imprecise and confusing. Link: https://lkml.kernel.org/r/20230913125113.313322-1-david@redhat.com Link: https://lkml.kernel.org/r/20230913125113.313322-2-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/rmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 9f795b93cf40..61ad50b9ed9f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1245,7 +1245,6 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); if (likely(!folio_test_ksm(folio))) { - /* address might be in next vma when migration races vma_merge */ if (first) __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); @@ -2549,7 +2548,6 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, BUG_ON(!folio_test_locked(folio)); BUG_ON(!anon_vma); - /* address might be in next vma when migration races vma_merge */ first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); -- cgit v1.2.3 From c66db8c0702c0ab741ecfd5e12b323ff49fe9089 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:09 +0200 Subject: mm/rmap: move SetPageAnonExclusive out of __page_set_anon_rmap() Let's handle it in the caller. No need to pass the page. While at it, rename the function to __folio_set_anon() and pass "bool exclusive" instead of "int exclusive". Link: https://lkml.kernel.org/r/20230913125113.313322-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 61ad50b9ed9f..f76f0f23725c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1122,27 +1122,25 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) } /** - * __page_set_anon_rmap - set up new anonymous rmap - * @folio: Folio which contains page. - * @page: Page to add to rmap. - * @vma: VM area to add page to. + * __folio_set_anon - set up a new anonymous rmap for a folio + * @folio: The folio to set up the new anonymous rmap for. + * @vma: VM area to add the folio to. * @address: User virtual address of the mapping - * @exclusive: the page is exclusively owned by the current process + * @exclusive: Whether the folio is exclusive to the process. */ -static void __page_set_anon_rmap(struct folio *folio, struct page *page, - struct vm_area_struct *vma, unsigned long address, int exclusive) +static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, + unsigned long address, bool exclusive) { struct anon_vma *anon_vma = vma->anon_vma; BUG_ON(!anon_vma); if (folio_test_anon(folio)) - goto out; + return; /* - * If the page isn't exclusively mapped into this vma, - * we must use the _oldest_ possible anon_vma for the - * page mapping! + * If the folio isn't exclusive to this vma, we must use the _oldest_ + * possible anon_vma for the folio mapping! */ if (!exclusive) anon_vma = anon_vma->root; @@ -1156,9 +1154,6 @@ static void __page_set_anon_rmap(struct folio *folio, struct page *page, anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; WRITE_ONCE(folio->mapping, (struct address_space *) anon_vma); folio->index = linear_page_index(vma, address); -out: - if (exclusive) - SetPageAnonExclusive(page); } /** @@ -1246,11 +1241,13 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (likely(!folio_test_ksm(folio))) { if (first) - __page_set_anon_rmap(folio, page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); else __page_check_anon_rmap(folio, page, vma, address); } + if (flags & RMAP_EXCLUSIVE) + SetPageAnonExclusive(page); mlock_vma_folio(folio, vma, compound); } @@ -1289,7 +1286,8 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - __page_set_anon_rmap(folio, &folio->page, vma, address, 1); + __folio_set_anon(folio, vma, address, true); + SetPageAnonExclusive(&folio->page); } /** @@ -2552,8 +2550,10 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); if (first) - __page_set_anon_rmap(folio, page, vma, address, - !!(flags & RMAP_EXCLUSIVE)); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + if (flags & RMAP_EXCLUSIVE) + SetPageAnonExclusive(page); } void hugepage_add_new_anon_rmap(struct folio *folio, @@ -2563,6 +2563,7 @@ void hugepage_add_new_anon_rmap(struct folio *folio, /* increment count (starts at -1) */ atomic_set(&folio->_entire_mapcount, 0); folio_clear_hugetlb_restore_reserve(folio); - __page_set_anon_rmap(folio, &folio->page, vma, address, 1); + __folio_set_anon(folio, vma, address, true); + SetPageAnonExclusive(&folio->page); } #endif /* CONFIG_HUGETLB_PAGE */ -- cgit v1.2.3 From c5c540034747dfe450f64d1151081a6080daa8f9 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:10 +0200 Subject: mm/rmap: move folio_test_anon() check out of __folio_set_anon() Let's handle it in the caller; no need for the "first" check based on the mapcount. We really only end up with !anon pages in page_add_anon_rmap() via do_swap_page(), where we hold the folio lock. So races are not possible. Add a VM_WARN_ON_FOLIO() to make sure that we really hold the folio lock. In the future, we might want to let do_swap_page() use folio_add_new_anon_rmap() on new pages instead: however, we might have to pass then whether the folio is exclusive or not. So keep it in there for now. For hugetlb we never expect to have a non-anon page in hugepage_add_anon_rmap(). Remove that code, along with some other checks that are either not required or were checked in hugepage_add_new_anon_rmap() already. Link: https://lkml.kernel.org/r/20230913125113.313322-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index f76f0f23725c..65de3832123e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1135,9 +1135,6 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, BUG_ON(!anon_vma); - if (folio_test_anon(folio)) - return; - /* * If the folio isn't exclusive to this vma, we must use the _oldest_ * possible anon_vma for the folio mapping! @@ -1239,12 +1236,12 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (nr) __lruvec_stat_mod_folio(folio, NR_ANON_MAPPED, nr); - if (likely(!folio_test_ksm(folio))) { - if (first) - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); - else - __page_check_anon_rmap(folio, page, vma, address); + if (unlikely(!folio_test_anon(folio))) { + VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + __folio_set_anon(folio, vma, address, + !!(flags & RMAP_EXCLUSIVE)); + } else if (likely(!folio_test_ksm(folio))) { + __page_check_anon_rmap(folio, page, vma, address); } if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); @@ -2541,17 +2538,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); - struct anon_vma *anon_vma = vma->anon_vma; int first; - BUG_ON(!folio_test_locked(folio)); - BUG_ON(!anon_vma); + VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); + first = atomic_inc_and_test(&folio->_entire_mapcount); VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) - __folio_set_anon(folio, vma, address, - !!(flags & RMAP_EXCLUSIVE)); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); } -- cgit v1.2.3 From a1f34ee1de2c3a55bc2a6b9a38e1ecd2830dcc03 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:11 +0200 Subject: mm/rmap: warn on new PTE-mapped folios in page_add_anon_rmap() If swapin code would ever decide to not use order-0 pages and supply a PTE-mapped large folio, we will have to change how we call __folio_set_anon() -- eventually with exclusive=false and an adjusted address. For now, let's add a VM_WARN_ON_FOLIO() with a comment about the situation. Link: https://lkml.kernel.org/r/20230913125113.313322-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/rmap.c b/mm/rmap.c index 65de3832123e..9b40c3feba3e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1238,6 +1238,13 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, if (unlikely(!folio_test_anon(folio))) { VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio); + /* + * For a PTE-mapped large folio, we only know that the single + * PTE is exclusive. Further, __folio_set_anon() might not get + * folio->index right when not given the address of the head + * page. + */ + VM_WARN_ON_FOLIO(folio_test_large(folio) && !compound, folio); __folio_set_anon(folio, vma, address, !!(flags & RMAP_EXCLUSIVE)); } else if (likely(!folio_test_ksm(folio))) { -- cgit v1.2.3 From 132b180f06a74ddfc526709928036db3b7a1cf6d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:12 +0200 Subject: mm/rmap: simplify PageAnonExclusive sanity checks when adding anon rmap Let's sanity-check PageAnonExclusive vs. mapcount in page_add_anon_rmap() and hugepage_add_anon_rmap() after setting PageAnonExclusive simply by re-reading the mapcounts. We can stop initializing the "first" variable in page_add_anon_rmap() and no longer need an atomic_inc_and_test() in hugepage_add_anon_rmap(). While at it, switch to VM_WARN_ON_FOLIO(). [david@redhat.com: update check for doubly-mapped page] Link: https://lkml.kernel.org/r/d8e5a093-2e22-c14b-7e64-6da280398d9f@redhat.com Link: https://lkml.kernel.org/r/20230913125113.313322-6-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/rmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 9b40c3feba3e..ed4b602bcbd5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1199,7 +1199,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, atomic_t *mapped = &folio->_nr_pages_mapped; int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first = true; + bool first; /* Is page being mapped by PTE? Is this its first map to be added? */ if (likely(!compound)) { @@ -1228,9 +1228,6 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } } - VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); - VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (nr_pmdmapped) __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped); if (nr) @@ -1252,6 +1249,10 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, } if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); + /* While PTE-mapping a THP we have a PMD and a PTE mapping. */ + VM_WARN_ON_FOLIO((atomic_read(&page->_mapcount) > 0 || + (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && + PageAnonExclusive(page), folio); mlock_vma_folio(folio, vma, compound); } @@ -2545,15 +2546,14 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { struct folio *folio = page_folio(page); - int first; VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); - first = atomic_inc_and_test(&folio->_entire_mapcount); - VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); - VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); + atomic_inc(&folio->_entire_mapcount); if (flags & RMAP_EXCLUSIVE) SetPageAnonExclusive(page); + VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && + PageAnonExclusive(page), folio); } void hugepage_add_new_anon_rmap(struct folio *folio, -- cgit v1.2.3 From 09c550508a4b8f7844b197cc16877dd0f7c42d8f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 13 Sep 2023 14:51:13 +0200 Subject: mm/rmap: pass folio to hugepage_add_anon_rmap() Let's pass a folio; we are always mapping the entire thing. Link: https://lkml.kernel.org/r/20230913125113.313322-7-david@redhat.com Signed-off-by: David Hildenbrand Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/migrate.c | 2 +- mm/rmap.c | 8 +++----- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 51cc21ebb568..d22f4d21a11c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -203,7 +203,7 @@ void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); -void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, +void hugepage_add_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address, rmap_t flags); void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); diff --git a/mm/migrate.c b/mm/migrate.c index 2053b54556ca..eb6bc4053bc4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -249,7 +249,7 @@ static bool remove_migration_pte(struct folio *folio, pte = arch_make_huge_pte(pte, shift, vma->vm_flags); if (folio_test_anon(folio)) - hugepage_add_anon_rmap(new, vma, pvmw.address, + hugepage_add_anon_rmap(folio, vma, pvmw.address, rmap_flags); else page_dup_file_rmap(new, true); diff --git a/mm/rmap.c b/mm/rmap.c index ed4b602bcbd5..d24e2c36372e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2542,18 +2542,16 @@ void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc) * * RMAP_COMPOUND is ignored. */ -void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma, +void hugepage_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { - struct folio *folio = page_folio(page); - VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio); atomic_inc(&folio->_entire_mapcount); if (flags & RMAP_EXCLUSIVE) - SetPageAnonExclusive(page); + SetPageAnonExclusive(&folio->page); VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 && - PageAnonExclusive(page), folio); + PageAnonExclusive(&folio->page), folio); } void hugepage_add_new_anon_rmap(struct folio *folio, -- cgit v1.2.3 From a8ac4a767dcd9d87d8229045904d9fe15ea5e0e8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:24 +0800 Subject: mm: migrate: remove PageTransHuge check in numamigrate_isolate_page() Patch series "mm: migrate: more folio conversion and unification", v3. Convert more migrate functions to use a folio, it is also a preparation for large folio migration support when balancing numa. This patch (of 8): The assert VM_BUG_ON_PAGE(order && !PageTransHuge(page), page) is not very useful, 1) for a tail/base page, order = 0, for a head page, the order > 0 && PageTransHuge() is true 2) there is a PageCompound() check and only base page is handled in do_numa_page(), and do_huge_pmd_numa_page() only handle PMD-mapped THP 3) even though the page is a tail page, isolate_lru_page() will post a warning, and fail to isolate the page 4) if large folio/pte-mapped THP migration supported in the future, we could migrate the entire folio if numa fault on a tail page so just remove the check. Link: https://lkml.kernel.org/r/20230913095131.2426871-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230913095131.2426871-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Huang Ying Cc: Hugh Dickins Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eb6bc4053bc4..d1b8ab5a2417 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2486,8 +2486,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int nr_pages = thp_nr_pages(page); int order = compound_order(page); - VM_BUG_ON_PAGE(order && !PageTransHuge(page), page); - /* Do not migrate THP mapped by multiple processes */ if (PageTransHuge(page) && total_mapcount(page) > 1) return 0; -- cgit v1.2.3 From 728be28fae8c838d52c91dce4867133798146357 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:25 +0800 Subject: mm: migrate: remove THP mapcount check in numamigrate_isolate_page() The check of THP mapped by multiple processes was introduced by commit 04fa5d6a6547 ("mm: migrate: check page_count of THP before migrating") and refactor by commit 340ef3902cf2 ("mm: numa: cleanup flow of transhuge page migration"), which is out of date, since migrate_misplaced_page() is now using the standard migrate_pages() for small pages and THPs, the reference count checking is in folio_migrate_mapping(), so let's remove the special check for THP. Link: https://lkml.kernel.org/r/20230913095131.2426871-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Cc: David Hildenbrand Cc: Hugh Dickins Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index d1b8ab5a2417..0030703204d3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2486,10 +2486,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) int nr_pages = thp_nr_pages(page); int order = compound_order(page); - /* Do not migrate THP mapped by multiple processes */ - if (PageTransHuge(page) && total_mapcount(page) > 1) - return 0; - /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { int z; -- cgit v1.2.3 From 2ac9e99f3b21b2864305fbfba4bae5913274c409 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:26 +0800 Subject: mm: migrate: convert numamigrate_isolate_page() to numamigrate_isolate_folio() Rename numamigrate_isolate_page() to numamigrate_isolate_folio(), then make it takes a folio and use folio API to save compound_head() calls. Link: https://lkml.kernel.org/r/20230913095131.2426871-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 0030703204d3..1f1aebe8da18 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2481,10 +2481,9 @@ static struct folio *alloc_misplaced_dst_folio(struct folio *src, return __folio_alloc_node(gfp, order, nid); } -static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) +static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) { - int nr_pages = thp_nr_pages(page); - int order = compound_order(page); + int nr_pages = folio_nr_pages(folio); /* Avoid migrating to a node that is nearly full */ if (!migrate_balanced_pgdat(pgdat, nr_pages)) { @@ -2496,22 +2495,23 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (managed_zone(pgdat->node_zones + z)) break; } - wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); + wakeup_kswapd(pgdat->node_zones + z, 0, + folio_order(folio), ZONE_MOVABLE); return 0; } - if (!isolate_lru_page(page)) + if (!folio_isolate_lru(folio)) return 0; - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), + node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), nr_pages); /* - * Isolating the page has taken another reference, so the - * caller's reference can be safely dropped without the page + * Isolating the folio has taken another reference, so the + * caller's reference can be safely dropped without the folio * disappearing underneath us during migration. */ - put_page(page); + folio_put(folio); return 1; } @@ -2545,7 +2545,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (page_is_file_lru(page) && PageDirty(page)) goto out; - isolated = numamigrate_isolate_page(pgdat, page); + isolated = numamigrate_isolate_folio(pgdat, page_folio(page)); if (!isolated) goto out; -- cgit v1.2.3 From 73eab3ca481e5be0f1fd8140365d604482f84ee1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:27 +0800 Subject: mm: migrate: convert migrate_misplaced_page() to migrate_misplaced_folio() At present, numa balance only support base page and PMD-mapped THP, but we will expand to support to migrate large folio/pte-mapped THP in the future, it is better to make migrate_misplaced_page() to take a folio instead of a page, and rename it to migrate_misplaced_folio(), it is a preparation, also this remove several compound_head() calls. Link: https://lkml.kernel.org/r/20230913095131.2426871-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- include/linux/migrate.h | 4 ++-- mm/huge_memory.c | 2 +- mm/memory.c | 2 +- mm/migrate.c | 39 +++++++++++++++++++++------------------ 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 711dd9412561..2ce13e8a309b 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -142,10 +142,10 @@ const struct movable_operations *page_movable_ops(struct page *page) } #ifdef CONFIG_NUMA_BALANCING -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, +int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node); #else -static inline int migrate_misplaced_page(struct page *page, +static inline int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, int node) { return -EAGAIN; /* can't migrate now */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cfd83e91748..5c20e43782e4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1567,7 +1567,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); writable = false; - migrated = migrate_misplaced_page(page, vma, target_nid); + migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; diff --git a/mm/memory.c b/mm/memory.c index 0739ccb00e61..d956b231e835 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4812,7 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_page(page, vma, target_nid)) { + if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { page_nid = target_nid; flags |= TNF_MIGRATED; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 1f1aebe8da18..1b848f6b5fbc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2516,55 +2516,58 @@ static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio) } /* - * Attempt to migrate a misplaced page to the specified destination + * Attempt to migrate a misplaced folio to the specified destination * node. Caller is expected to have an elevated reference count on - * the page that will be dropped by this function before returning. + * the folio that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, - int node) +int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma, + int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; int nr_remaining; unsigned int nr_succeeded; LIST_HEAD(migratepages); - int nr_pages = thp_nr_pages(page); + int nr_pages = folio_nr_pages(folio); /* - * Don't migrate file pages that are mapped in multiple processes + * Don't migrate file folios that are mapped in multiple processes * with execute permissions as they are probably shared libraries. + * To check if the folio is shared, ideally we want to make sure + * every page is mapped to the same process. Doing that is very + * expensive, so check the estimated mapcount of the folio instead. */ - if (page_mapcount(page) != 1 && page_is_file_lru(page) && + if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) && (vma->vm_flags & VM_EXEC)) goto out; /* - * Also do not migrate dirty pages as not all filesystems can move - * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + * Also do not migrate dirty folios as not all filesystems can move + * dirty folios in MIGRATE_ASYNC mode which is a waste of cycles. */ - if (page_is_file_lru(page) && PageDirty(page)) + if (folio_is_file_lru(folio) && folio_test_dirty(folio)) goto out; - isolated = numamigrate_isolate_folio(pgdat, page_folio(page)); + isolated = numamigrate_isolate_folio(pgdat, folio); if (!isolated) goto out; - list_add(&page->lru, &migratepages); + list_add(&folio->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio, NULL, node, MIGRATE_ASYNC, MR_NUMA_MISPLACED, &nr_succeeded); if (nr_remaining) { if (!list_empty(&migratepages)) { - list_del(&page->lru); - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -nr_pages); - putback_lru_page(page); + list_del(&folio->lru); + node_stat_mod_folio(folio, NR_ISOLATED_ANON + + folio_is_file_lru(folio), -nr_pages); + folio_putback_lru(folio); } isolated = 0; } if (nr_succeeded) { count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded); - if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node)) + if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node)) mod_node_page_state(pgdat, PGPROMOTE_SUCCESS, nr_succeeded); } @@ -2572,7 +2575,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, return isolated; out: - put_page(page); + folio_put(folio); return 0; } #endif /* CONFIG_NUMA_BALANCING */ -- cgit v1.2.3 From 7e2a5e5ab217d5e4166cdbdf4af8c5e34b6200bb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:28 +0800 Subject: mm: migrate: use __folio_test_movable() Use __folio_test_movable(), no need to convert from folio to page again. Link: https://lkml.kernel.org/r/20230913095131.2426871-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Zi Yan Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1b848f6b5fbc..2a826da603bf 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -157,8 +157,8 @@ void putback_movable_pages(struct list_head *l) list_del(&folio->lru); /* * We isolated non-lru movable folio so here we can use - * __PageMovable because LRU folio's mapping cannot have - * PAGE_MAPPING_MOVABLE. + * __folio_test_movable because LRU folio's mapping cannot + * have PAGE_MAPPING_MOVABLE. */ if (unlikely(__folio_test_movable(folio))) { VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio); @@ -946,7 +946,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, enum migrate_mode mode) { int rc = -EAGAIN; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); VM_BUG_ON_FOLIO(!folio_test_locked(src), src); VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst); @@ -993,7 +993,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, * src is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - if (__PageMovable(&src->page)) { + if (__folio_test_movable(src)) { VM_BUG_ON_FOLIO(!folio_test_isolated(src), src); /* @@ -1085,7 +1085,7 @@ static void migrate_folio_done(struct folio *src, /* * Compaction can migrate also non-LRU pages which are * not accounted to NR_ISOLATED_*. They can be recognized - * as __PageMovable + * as __folio_test_movable */ if (likely(!__folio_test_movable(src))) mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + @@ -1106,7 +1106,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, int rc = -EAGAIN; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); bool locked = false; bool dst_locked = false; @@ -1264,7 +1264,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, int rc; int page_was_mapped = 0; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(&src->page); + bool is_lru = !__folio_test_movable(src); struct list_head *prev; __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); -- cgit v1.2.3 From d64cfccbc805663a2c5691f638cf9198b9676a9f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:29 +0800 Subject: mm: migrate: use a folio in add_page_for_migration() Use a folio in add_page_for_migration() to save compound_head() calls. Link: https://lkml.kernel.org/r/20230913095131.2426871-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 2a826da603bf..ce01d03e635a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2060,6 +2060,7 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, struct vm_area_struct *vma; unsigned long addr; struct page *page; + struct folio *folio; int err; bool isolated; @@ -2082,45 +2083,42 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, if (!page) goto out; - if (is_zone_device_page(page)) - goto out_putpage; + folio = page_folio(page); + if (folio_is_zone_device(folio)) + goto out_putfolio; err = 0; - if (page_to_nid(page) == node) - goto out_putpage; + if (folio_nid(folio) == node) + goto out_putfolio; err = -EACCES; if (page_mapcount(page) > 1 && !migrate_all) - goto out_putpage; + goto out_putfolio; - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { if (PageHead(page)) { - isolated = isolate_hugetlb(page_folio(page), pagelist); + isolated = isolate_hugetlb(folio, pagelist); err = isolated ? 1 : -EBUSY; } } else { - struct page *head; - - head = compound_head(page); - isolated = isolate_lru_page(head); + isolated = folio_isolate_lru(folio); if (!isolated) { err = -EBUSY; - goto out_putpage; + goto out_putfolio; } err = 1; - list_add_tail(&head->lru, pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_lru(head), - thp_nr_pages(head)); + list_add_tail(&folio->lru, pagelist); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); } -out_putpage: +out_putfolio: /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. + * Either remove the duplicate refcount from folio_isolate_lru() + * or drop the folio ref if it was not isolated. */ - put_page(page); + folio_put(folio); out: mmap_read_unlock(mm); return err; -- cgit v1.2.3 From b426ed7889be80359cb4edef142e5c5fa697b068 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:30 +0800 Subject: mm: migrate: remove PageHead() check for HugeTLB in add_page_for_migration() There is some different between hugeTLB and THP behave when passed the address of a tail page, for THP, it will migrate the entire THP page, but for HugeTLB, it will return -EACCES, or -ENOENT before commit e66f17ff7177 ("mm/hugetlb: take page table lock in follow_huge_pmd()"), -EACCES The page is mapped by multiple processes and can be moved only if MPOL_MF_MOVE_ALL is specified. -ENOENT The page is not present. But when check manual[1], both of the two errnos are not suitable, it is better to keep the same behave between hugetlb and THP when passed the address of a tail page, so let's just remove the PageHead() check for HugeTLB. [1] https://man7.org/linux/man-pages/man2/move_pages.2.html Link: https://lkml.kernel.org/r/20230913095131.2426871-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Suggested-by: Mike Kravetz Acked-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index ce01d03e635a..8495e7e02dd4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2096,10 +2096,8 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, goto out_putfolio; if (folio_test_hugetlb(folio)) { - if (PageHead(page)) { - isolated = isolate_hugetlb(folio, pagelist); - err = isolated ? 1 : -EBUSY; - } + isolated = isolate_hugetlb(folio, pagelist); + err = isolated ? 1 : -EBUSY; } else { isolated = folio_isolate_lru(folio); if (!isolated) { -- cgit v1.2.3 From fa1df3f6287e1e1fd8b5309828238e2c728e985f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 13 Sep 2023 17:51:31 +0800 Subject: mm: migrate: remove isolated variable in add_page_for_migration() Directly check the return of isolate_hugetlb() and folio_isolate_lru() to remove isolated variable, also setup err = -EBUSY in advance before isolation, and update err only when successfully queued for migration, which could help us to unify and simplify code a bit. Link: https://lkml.kernel.org/r/20230913095131.2426871-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Zi Yan Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/migrate.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 8495e7e02dd4..7d1804c4a5d9 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2062,7 +2062,6 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, struct page *page; struct folio *folio; int err; - bool isolated; mmap_read_lock(mm); addr = (unsigned long)untagged_addr_remote(mm, p); @@ -2095,15 +2094,13 @@ static int add_page_for_migration(struct mm_struct *mm, const void __user *p, if (page_mapcount(page) > 1 && !migrate_all) goto out_putfolio; + err = -EBUSY; if (folio_test_hugetlb(folio)) { - isolated = isolate_hugetlb(folio, pagelist); - err = isolated ? 1 : -EBUSY; + if (isolate_hugetlb(folio, pagelist)) + err = 1; } else { - isolated = folio_isolate_lru(folio); - if (!isolated) { - err = -EBUSY; + if (!folio_isolate_lru(folio)) goto out_putfolio; - } err = 1; list_add_tail(&folio->lru, pagelist); -- cgit v1.2.3 From c603c630b509d690d470b9b49eb96f40482f47d5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 13 Sep 2023 02:20:49 +0000 Subject: mm/damon/core: add a tracepoint for damos apply target regions Patch series "mm/damon: add a tracepoint for damos apply target regions", v2. DAMON provides damon_aggregated tracepoint to let users record full monitoring results. Sometimes, users need to record monitoring results of specific pattern. DAMOS tried regions directory of DAMON sysfs interface allows it, but the interface is mainly designed for snapshots and therefore would be inefficient for such recording. Implement yet another tracepoint for efficient support of the usecase. This patch (of 2): DAMON provides damon_aggregated tracepoint, which exposes details of each region and its access monitoring results. It is useful for getting whole monitoring results, e.g., for recording purposes. For investigations of DAMOS, DAMON Sysfs interface provides DAMOS statistics and tried_regions directory. But, those provides only statistics and snapshots. If the scheme is frequently applied and if the user needs to know every detail of DAMOS behavior, the snapshot-based interface could be insufficient and expensive. As a last resort, userspace users need to record the all monitoring results via damon_aggregated tracepoint and simulate how DAMOS would worked. It is unnecessarily complicated. DAMON kernel API users, meanwhile, can do that easily via before_damos_apply() callback field of 'struct damon_callback', though. Add a tracepoint that will be called just after before_damos_apply() callback for more convenient investigations of DAMOS. The tracepoint exposes all details about each regions, similar to damon_aggregated tracepoint. Please note that DAMOS is currently not only for memory management but also for query-like efficient monitoring results retrievals (when 'stat' action is used). Until now, only statistics or snapshots were supported. Addition of this tracepoint allows efficient full recording of DAMOS-based filtered monitoring results. Link: https://lkml.kernel.org/r/20230913022050.2109-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230913022050.2109-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Steven Rostedt (Google) [tracing] Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 39 +++++++++++++++++++++++++++++++++++++++ mm/damon/core.c | 32 +++++++++++++++++++++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 0b8d13bde17a..19930bb7af9a 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -9,6 +9,45 @@ #include #include +TRACE_EVENT_CONDITION(damos_before_apply, + + TP_PROTO(unsigned int context_idx, unsigned int scheme_idx, + unsigned int target_idx, struct damon_region *r, + unsigned int nr_regions, bool do_trace), + + TP_ARGS(context_idx, target_idx, scheme_idx, r, nr_regions, do_trace), + + TP_CONDITION(do_trace), + + TP_STRUCT__entry( + __field(unsigned int, context_idx) + __field(unsigned int, scheme_idx) + __field(unsigned long, target_idx) + __field(unsigned long, start) + __field(unsigned long, end) + __field(unsigned int, nr_accesses) + __field(unsigned int, age) + __field(unsigned int, nr_regions) + ), + + TP_fast_assign( + __entry->context_idx = context_idx; + __entry->scheme_idx = scheme_idx; + __entry->target_idx = target_idx; + __entry->start = r->ar.start; + __entry->end = r->ar.end; + __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; + __entry->nr_regions = nr_regions; + ), + + TP_printk("ctx_idx=%u scheme_idx=%u target_idx=%lu nr_regions=%u %lu-%lu: %u %u", + __entry->context_idx, __entry->scheme_idx, + __entry->target_idx, __entry->nr_regions, + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) +); + TRACE_EVENT(damon_aggregated, TP_PROTO(unsigned int target_id, struct damon_region *r, diff --git a/mm/damon/core.c b/mm/damon/core.c index ca631dd88b33..3ca34a252a3c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -950,6 +950,33 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct timespec64 begin, end; unsigned long sz_applied = 0; int err = 0; + /* + * We plan to support multiple context per kdamond, as DAMON sysfs + * implies with 'nr_contexts' file. Nevertheless, only single context + * per kdamond is supported for now. So, we can simply use '0' context + * index here. + */ + unsigned int cidx = 0; + struct damos *siter; /* schemes iterator */ + unsigned int sidx = 0; + struct damon_target *titer; /* targets iterator */ + unsigned int tidx = 0; + bool do_trace = false; + + /* get indices for trace_damos_before_apply() */ + if (trace_damos_before_apply_enabled()) { + damon_for_each_scheme(siter, c) { + if (siter == s) + break; + sidx++; + } + damon_for_each_target(titer, c) { + if (titer == t) + break; + tidx++; + } + do_trace = true; + } if (c->ops.apply_scheme) { if (quota->esz && quota->charged_sz + sz > quota->esz) { @@ -964,8 +991,11 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) err = c->callback.before_damos_apply(c, t, r, s); - if (!err) + if (!err) { + trace_damos_before_apply(cidx, sidx, tidx, r, + damon_nr_regions(t), do_trace); sz_applied = c->ops.apply_scheme(c, t, r, s); + } ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); -- cgit v1.2.3 From 1b2b7a17ab60ed58875ba6a86ec79fad2f7b7d38 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 13 Sep 2023 02:20:50 +0000 Subject: Docs/admin-guide/mm/damon/usage: document damos_before_apply tracepoint Document damos_before_apply tracepoint on the usage document. Link: https://lkml.kernel.org/r/20230913022050.2109-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 37 ++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 282062b6f134..6272cd36590a 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -496,15 +496,24 @@ the files as above. Above is only for an example. .. _tracepoint: -Tracepoint for Monitoring Results -================================= +Tracepoints for Monitoring Results +================================== Users can get the monitoring results via the :ref:`tried_regions -` or a tracepoint, ``damon:damon_aggregated``. -While the tried regions directory is useful for getting a snapshot, the -tracepoint is useful for getting a full record of the results. While the -monitoring is turned on, you could record the tracepoint events and show -results using tracepoint supporting tools like ``perf``. For example:: +`. The interface is useful for getting a +snapshot, but it could be inefficient for fully recording all the monitoring +results. For the purpose, two trace points, namely ``damon:damon_aggregated`` +and ``damon:damos_before_apply``, are provided. ``damon:damon_aggregated`` +provides the whole monitoring results, while ``damon:damos_before_apply`` +provides the monitoring results for regions that each DAMON-based Operation +Scheme (:ref:`DAMOS `) is gonna be applied. Hence, +``damon:damos_before_apply`` is more useful for recording internal behavior of +DAMOS, or DAMOS target access +:ref:`pattern ` based query-like efficient +monitoring results recording. + +While the monitoring is turned on, you could record the tracepoint events and +show results using tracepoint supporting tools like ``perf``. For example:: # echo on > monitor_on # perf record -e damon:damon_aggregated & @@ -527,6 +536,20 @@ counter). Finally the tenth field (``X``) shows the ``age`` of the region (refer to :ref:`design ` for more details of the counter). +If the event was ``damon:damos_beofre_apply``, the ``perf script`` output would +be somewhat like below:: + + kdamond.0 47293 [000] 80801.060214: damon:damos_before_apply: ctx_idx=0 scheme_idx=0 target_idx=0 nr_regions=11 121932607488-135128711168: 0 136 + [...] + +Each line of the output represents each monitoring region that each DAMON-based +Operation Scheme was about to be applied at the traced time. The first five +fields are as usual. It shows the index of the DAMON context (``ctx_idx=X``) +of the scheme in the list of the contexts of the context's kdamond, the index +of the scheme (``scheme_idx=X``) in the list of the schemes of the context, in +addition to the output of ``damon_aggregated`` tracepoint. + + .. _debugfs_interface: debugfs Interface (DEPRECATED!) -- cgit v1.2.3 From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:04 +0100 Subject: buffer: pass GFP flags to folio_alloc_buffers() Patch series "Add and use bdev_getblk()", v2. This patch series fixes a bug reported by Hui Zhu; see proposed patches v1 and v2: https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/ https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/ I decided to go in a rather different direction for this fix, and fix a related problem at the same time. I don't think there's any urgency to rush this into Linus' tree, nor have I marked it for stable. Reasonable people may disagree. This patch (of 8): Instead of creating entirely new flags, inherit them from grow_dev_page(). The other callers create the same flags that this function used to create. Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 17 +++++++++-------- include/linux/buffer_head.h | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..32338b7cfeb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -915,16 +915,12 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry) + gfp_t gfp) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; struct mem_cgroup *memcg, *old_memcg; - if (retry) - gfp |= __GFP_NOFAIL; - /* The folio lock pins the memcg */ memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); @@ -967,7 +963,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { - return folio_alloc_buffers(page_folio(page), size, retry); + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; + if (retry) + gfp |= __GFP_NOFAIL; + + return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1069,7 +1069,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, true); + bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); /* * Link the folio to the buffers and initialise them. Take the @@ -1644,8 +1644,9 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; - head = folio_alloc_buffers(folio, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 44e9de51eedf..67d94d2be475 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -198,7 +198,7 @@ void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry); + gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, -- cgit v1.2.3 From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:05 +0100 Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp() grow_dev_page() is only called by grow_buffers(). grow_buffers() is only called by __getblk_slow() and __getblk_slow() is only called from __getblk_gfp(), so it is safe to move the GFP flags setting all the way up. With that done, add a new bdev_getblk() entry point that leaves the GFP flags the way the caller specified them. [willy@infradead.org: fix grow_dev_page() error handling] Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/buffer.c | 61 +++++++++++++++++++++++++++++---------------- include/linux/buffer_head.h | 2 ++ 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 32338b7cfeb9..80e96c1fcd33 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1043,20 +1043,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; - gfp_t gfp_mask; - - gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; - - /* - * XXX: __getblk_slow() can not really deal with failure and - * will endlessly loop on improvised global reclaim. Prefer - * looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp_mask |= __GFP_NOFAIL; folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); bh = folio_buffers(folio); if (bh) { @@ -1069,7 +1060,10 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); + ret = -ENOMEM; + bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); + if (!bh) + goto failed; /* * Link the folio to the buffers and initialise them. Take the @@ -1420,24 +1414,49 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) } EXPORT_SYMBOL(__find_get_block); +/** + * bdev_getblk - Get a buffer_head in a block device's buffer cache. + * @bdev: The block device. + * @block: The block number. + * @size: The size of buffer_heads for this @bdev. + * @gfp: The memory allocation flags to use. + * + * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; + * they are not augmented with the mapping's GFP flags. + * + * Return: The buffer head, or NULL if memory could not be allocated. + */ +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_alloc(gfp); + if (bh) + return bh; + + return __getblk_slow(bdev, block, size, gfp); +} +EXPORT_SYMBOL(bdev_getblk); + /* * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. - * - * __getblk_gfp() will lock up the machine if grow_dev_page's - * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * __getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - might_sleep(); - if (bh == NULL) - bh = __getblk_slow(bdev, block, size, gfp); - return bh; + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + return bdev_getblk(bdev, block, size, gfp); } EXPORT_SYMBOL(__getblk_gfp); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 67d94d2be475..7825bb3d63a7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -227,6 +227,8 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); -- cgit v1.2.3 From e509ad4d77e6c8852f082104e388110d16fdfb97 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:06 +0100 Subject: ext4: use bdev_getblk() to avoid memory reclaim in readahead path sb_getblk_gfp adds __GFP_NOFAIL, which is unnecessary for readahead; we're quite comfortable with the possibility that we may not get a bh back. Switch to bdev_getblk() which does not include __GFP_NOFAIL. Link: https://lkml.kernel.org/r/20230914150011.843330-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Hui Zhu Closes: https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/ Signed-off-by: Andrew Morton --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dbebd8b3127e..268d812b0add 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -255,7 +255,8 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) -- cgit v1.2.3 From 775d9b10530a0a303dc08a9cdb2ed1f8667d9c5f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:07 +0100 Subject: buffer: use bdev_getblk() to avoid memory reclaim in readahead path __getblk() adds __GFP_NOFAIL, which is unnecessary for readahead; we're quite comfortable with the possibility that we may not get a bh back. Switch to bdev_getblk() which does not include __GFP_NOFAIL. Link: https://lkml.kernel.org/r/20230914150011.843330-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index 80e96c1fcd33..edd118594565 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1465,7 +1465,9 @@ EXPORT_SYMBOL(__getblk_gfp); */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = bdev_getblk(bdev, block, size, + GFP_NOWAIT | __GFP_MOVABLE); + if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); -- cgit v1.2.3 From c645e65c0675dd4df7ee68b995154dc1c1e7ce3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:08 +0100 Subject: buffer: convert getblk_unmovable() and __getblk() to use bdev_getblk() Move these two functions up in the file for the benefit of the next patch, and pass in all of the GFP flags to use instead of the partial GFP flags used by __getblk_gfp(). Link: https://lkml.kernel.org/r/20230914150011.843330-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7825bb3d63a7..9a3ca5f6d63d 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -340,6 +340,28 @@ sb_breadahead(struct super_block *sb, sector_t block) __breadahead(sb->s_bdev, block, sb->s_blocksize); } +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, + sector_t block, unsigned size) +{ + gfp_t gfp; + + gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + gfp |= __GFP_NOFAIL; + + return bdev_getblk(bdev, block, size, gfp); +} + +static inline struct buffer_head *__getblk(struct block_device *bdev, + sector_t block, unsigned size) +{ + gfp_t gfp; + + gfp = mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + gfp |= __GFP_MOVABLE | __GFP_NOFAIL; + + return bdev_getblk(bdev, block, size, gfp); +} + static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { @@ -387,20 +409,6 @@ static inline void lock_buffer(struct buffer_head *bh) __lock_buffer(bh); } -static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, - sector_t block, - unsigned size) -{ - return __getblk_gfp(bdev, block, size, 0); -} - -static inline struct buffer_head *__getblk(struct block_device *bdev, - sector_t block, - unsigned size) -{ - return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); -} - static inline void bh_readahead(struct buffer_head *bh, blk_opf_t op_flags) { if (!buffer_uptodate(bh) && trylock_buffer(bh)) { -- cgit v1.2.3 From 4b9c8b1919323f7f359376ca31a4c721cb2b3acf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:09 +0100 Subject: buffer: convert sb_getblk() to call __getblk() Now that __getblk() is in the right place in the file, it is trivial to call it from sb_getblk(). Link: https://lkml.kernel.org/r/20230914150011.843330-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 9a3ca5f6d63d..b294e2cccbae 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -362,13 +362,12 @@ static inline struct buffer_head *__getblk(struct block_device *bdev, return bdev_getblk(bdev, block, size, gfp); } -static inline struct buffer_head * -sb_getblk(struct super_block *sb, sector_t block) +static inline struct buffer_head *sb_getblk(struct super_block *sb, + sector_t block) { - return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); + return __getblk(sb->s_bdev, block, sb->s_blocksize); } - static inline struct buffer_head * sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) { -- cgit v1.2.3 From 8a83ac54940d27b8f56d766e1cb270d150fedd50 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:10 +0100 Subject: ext4: call bdev_getblk() from sb_getblk_gfp() Most of the callers of sb_getblk_gfp() already assumed that they were passing the entire GFP flags to use. Fix up the two callers that didn't, and remove the __GFP_NOFAIL from them since they both appear to correctly handle failure. Link: https://lkml.kernel.org/r/20230914150011.843330-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/ext4/super.c | 10 ++++++++-- include/linux/buffer_head.h | 6 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 268d812b0add..c00ec159dea5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -244,13 +244,19 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - return __ext4_sb_bread_gfp(sb, block, 0, 0); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b294e2cccbae..22f13eece719 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -368,10 +368,10 @@ static inline struct buffer_head *sb_getblk(struct super_block *sb, return __getblk(sb->s_bdev, block, sb->s_blocksize); } -static inline struct buffer_head * -sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) +static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb, + sector_t block, gfp_t gfp) { - return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp); + return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp); } static inline struct buffer_head * -- cgit v1.2.3 From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:11 +0100 Subject: buffer: remove __getblk_gfp() Inline it into __bread_gfp(). Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 36 +++++++++++------------------------- include/linux/buffer_head.h | 2 -- 2 files changed, 11 insertions(+), 27 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index edd118594565..edec8652788c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1421,9 +1421,6 @@ EXPORT_SYMBOL(__find_get_block); * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * - * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; - * they are not augmented with the mapping's GFP flags. - * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, @@ -1439,27 +1436,6 @@ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(bdev_getblk); -/* - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head - * which corresponds to the passed block_device, block and size. The - * returned buffer has its reference count incremented. - */ -struct buffer_head * -__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp) -{ - gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - - /* - * Prefer looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp |= __GFP_NOFAIL; - - return bdev_getblk(bdev, block, size, gfp); -} -EXPORT_SYMBOL(__getblk_gfp); - /* * Do async read-ahead on a buffer.. */ @@ -1491,7 +1467,17 @@ struct buffer_head * __bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); + struct buffer_head *bh; + + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 22f13eece719..3dc4720e4773 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -229,8 +229,6 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); -struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -- cgit v1.2.3 From 83121580f2eb487b7510fb892674c28412e7c73e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Sep 2023 15:16:38 +0200 Subject: trace-vmscan-postprocess: sync with tracepoints updates The script has fallen behind tracepoint changes for a while, fix it up. Most changes are mechanical (renames, removal of tracepoint parameters that are not used by the script). More notable change involves mm_vmscan_lru_isolate which is relying on the isolate_mode to determine if the inactive list is being scanned. However the parameter currently only indicates ISOLATE_UNMAPPED. We can use the lru parameter instead to determine which list is scanned, and stop checking isolate_mode. Link: https://lkml.kernel.org/r/20230914131637.12204-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Mel Gorman Signed-off-by: Andrew Morton --- .../trace/postprocess/trace-vmscan-postprocess.pl | 40 +++++++++------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index e24c009789a0..725d41a8d4ef 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -107,14 +107,14 @@ GetOptions( ); # Defaults for dynamically discovered regex's -my $regex_direct_begin_default = 'order=([0-9]*) may_writepage=([0-9]*) gfp_flags=([A-Z_|]*)'; +my $regex_direct_begin_default = 'order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; -my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; +my $regex_wakeup_kswapd_default = 'nid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; +my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; -my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; +my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_taken=([0-9]*) nr_active=([0-9]*) nr_deactivated=([0-9]*) nr_referenced=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)' ; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; # Dyanically discovered regex @@ -184,8 +184,7 @@ sub generate_traceevent_regex { $regex_direct_begin = generate_traceevent_regex( "vmscan/mm_vmscan_direct_reclaim_begin", $regex_direct_begin_default, - "order", "may_writepage", - "gfp_flags"); + "order", "gfp_flags"); $regex_direct_end = generate_traceevent_regex( "vmscan/mm_vmscan_direct_reclaim_end", $regex_direct_end_default, @@ -201,11 +200,11 @@ $regex_kswapd_sleep = generate_traceevent_regex( $regex_wakeup_kswapd = generate_traceevent_regex( "vmscan/mm_vmscan_wakeup_kswapd", $regex_wakeup_kswapd_default, - "nid", "zid", "order", "gfp_flags"); + "nid", "order", "gfp_flags"); $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, - "isolate_mode", "classzone_idx", "order", + "isolate_mode", classzone", "order", "nr_requested", "nr_scanned", "nr_skipped", "nr_taken", "lru"); $regex_lru_shrink_inactive = generate_traceevent_regex( @@ -218,11 +217,10 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, - "nid", "zid", - "lru", - "nr_scanned", "nr_rotated", "priority"); + "nid", "nr_taken", "nr_active", "nr_deactivated", "nr_referenced", + "priority", "flags"); $regex_writepage = generate_traceevent_regex( - "vmscan/mm_vmscan_writepage", + "vmscan/mm_vmscan_write_folio", $regex_writepage_default, "page", "pfn", "flags"); @@ -371,7 +369,7 @@ EVENT_PROCESS: print " $regex_wakeup_kswapd\n"; next; } - my $order = $3; + my $order = $2; $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD_PERORDER}[$order]++; } elsif ($tracepoint eq "mm_vmscan_lru_isolate") { $details = $6; @@ -381,18 +379,14 @@ EVENT_PROCESS: print " $regex_lru_isolate/o\n"; next; } - my $isolate_mode = $1; my $nr_scanned = $5; - my $file = $8; - - # To closer match vmstat scanning statistics, only count isolate_both - # and isolate_inactive as scanning. isolate_active is rotation - # isolate_inactive == 1 - # isolate_active == 2 - # isolate_both == 3 - if ($isolate_mode != 2) { + my $lru = $8; + + # To closer match vmstat scanning statistics, only count + # inactive lru as scanning + if ($lru =~ /inactive_/) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; - if ($file =~ /_file/) { + if ($lru =~ /_file/) { $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; } else { $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; -- cgit v1.2.3 From 3dfbb555c98ac55b9d911f9af0e35014b445fb41 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 14 Sep 2023 15:16:39 +0200 Subject: mm, vmscan: remove ISOLATE_UNMAPPED This isolate_mode_t flag is effectively unused since 89f6c88a6ab4 ("mm: __isolate_lru_page_prepare() in isolate_migratepages_block()") as sc->may_unmap is now checked directly (and only node_reclaim has a mode that sets it to 0). The last remaining place is mm_vmscan_lru_isolate tracepoint for the isolate_mode parameter. That one was mainly used to indicate the active/inactive mode, which the trace-vmscan-postprocess.pl script consumed, but that got silently broken. After fixing the script by the previous patch, it does not need the isolate_mode anymore. So just remove the parameter and with that the whole ISOLATE_UNMAPPED flag. Link: https://lkml.kernel.org/r/20230914131637.12204-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/trace/postprocess/trace-vmscan-postprocess.pl | 8 ++++---- include/linux/mmzone.h | 2 -- include/trace/events/vmscan.h | 8 ++------ mm/vmscan.c | 3 +-- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 725d41a8d4ef..048dc0dbce64 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -112,7 +112,7 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; +my $regex_lru_isolate_default = 'classzone=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_taken=([0-9]*) nr_active=([0-9]*) nr_deactivated=([0-9]*) nr_referenced=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)' ; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -204,7 +204,7 @@ $regex_wakeup_kswapd = generate_traceevent_regex( $regex_lru_isolate = generate_traceevent_regex( "vmscan/mm_vmscan_lru_isolate", $regex_lru_isolate_default, - "isolate_mode", classzone", "order", + "classzone", "order", "nr_requested", "nr_scanned", "nr_skipped", "nr_taken", "lru"); $regex_lru_shrink_inactive = generate_traceevent_regex( @@ -379,8 +379,8 @@ EVENT_PROCESS: print " $regex_lru_isolate/o\n"; next; } - my $nr_scanned = $5; - my $lru = $8; + my $nr_scanned = $4; + my $lru = $7; # To closer match vmstat scanning statistics, only count # inactive lru as scanning diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4106fbc5b4b3..486587fcd27f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,8 +639,6 @@ struct lruvec { #endif }; -/* Isolate unmapped pages */ -#define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d2123dd960d5..1a488c30afa5 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -285,10 +285,9 @@ TRACE_EVENT(mm_vmscan_lru_isolate, unsigned long nr_scanned, unsigned long nr_skipped, unsigned long nr_taken, - isolate_mode_t isolate_mode, int lru), - TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, isolate_mode, lru), + TP_ARGS(highest_zoneidx, order, nr_requested, nr_scanned, nr_skipped, nr_taken, lru), TP_STRUCT__entry( __field(int, highest_zoneidx) @@ -297,7 +296,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __field(unsigned long, nr_scanned) __field(unsigned long, nr_skipped) __field(unsigned long, nr_taken) - __field(unsigned int, isolate_mode) __field(int, lru) ), @@ -308,7 +306,6 @@ TRACE_EVENT(mm_vmscan_lru_isolate, __entry->nr_scanned = nr_scanned; __entry->nr_skipped = nr_skipped; __entry->nr_taken = nr_taken; - __entry->isolate_mode = (__force unsigned int)isolate_mode; __entry->lru = lru; ), @@ -316,8 +313,7 @@ TRACE_EVENT(mm_vmscan_lru_isolate, * classzone is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ - TP_printk("isolate_mode=%d classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", - __entry->isolate_mode, + TP_printk("classzone=%d order=%d nr_requested=%lu nr_scanned=%lu nr_skipped=%lu nr_taken=%lu lru=%s", __entry->highest_zoneidx, __entry->order, __entry->nr_requested, diff --git a/mm/vmscan.c b/mm/vmscan.c index acf115468bf8..3df0e2a59052 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1689,8 +1689,7 @@ move: } *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - total_scan, skipped, nr_taken, - sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru); + total_scan, skipped, nr_taken, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } -- cgit v1.2.3 From 2e7cfe5cd5b6b0b98abf57a3074885979e187c1c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:44 -0400 Subject: mm/cma: use nth_page() in place of direct struct page manipulation Patch series "Use nth_page() in place of direct struct page manipulation", v3. On SPARSEMEM without VMEMMAP, struct page is not guaranteed to be contiguous, since each memory section's memmap might be allocated independently. hugetlb pages can go beyond a memory section size, thus direct struct page manipulation on hugetlb pages/subpages might give wrong struct page. Kernel provides nth_page() to do the manipulation properly. Use that whenever code can see hugetlb pages. This patch (of 5): When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. Without the fix, page_kasan_tag_reset() could reset wrong page tags, causing a wrong kasan result. No related bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20230913201248.452081-2-zi.yan@sent.com Fixes: 2813b9c02962 ("kasan, mm, arm64: tag non slab memory allocated via pagealloc") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index da2967c6a223..2b2494fd6b59 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -505,7 +505,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, */ if (page) { for (i = 0; i < count; i++) - page_kasan_tag_reset(page + i); + page_kasan_tag_reset(nth_page(page, i)); } if (ret && !no_warn) { -- cgit v1.2.3 From 426056efe835cf4864ccf4c328fe3af9146fc539 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:45 -0400 Subject: mm/hugetlb: use nth_page() in place of direct struct page manipulation When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. A wrong or non-existing page might be tried to be grabbed, either leading to a non freeable page or kernel memory access errors. No bug is reported. It comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-3-zi.yan@sent.com Fixes: 57a196a58421 ("hugetlb: simplify hugetlb handling in follow_page_mask") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7c90c43574a6..a945efe2858a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6493,7 +6493,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, } } - page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = nth_page(page, ((address & ~huge_page_mask(h)) >> PAGE_SHIFT)); /* * Note that page may be a sub-page, and with vmemmap -- cgit v1.2.3 From 1640a0ef80f6d572725f5b0330038c18e98ea168 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:46 -0400 Subject: mm/memory_hotplug: use pfn math in place of direct struct page manipulation When dealing with hugetlb pages, manipulating struct page pointers directly can get to wrong struct page, since struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use pfn calculation to handle it properly. Without the fix, a wrong number of page might be skipped. Since skip cannot be negative, scan_movable_page() will end early and might miss a movable page with -ENOENT. This might fail offline_pages(). No bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-4-zi.yan@sent.com Fixes: eeb0efd071d8 ("mm,memory_hotplug: fix scan_movable_pages() for gigantic hugepages") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b03f4ec6fd2..3b301c4023ff 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1689,7 +1689,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, */ if (HPageMigratable(head)) goto found; - skip = compound_nr(head) - (page - head); + skip = compound_nr(head) - (pfn - page_to_pfn(head)); pfn += skip - 1; } return -ENOENT; -- cgit v1.2.3 From 8db0ec791f7788cd21e7f91ee5ff42c1c458d0e7 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:47 -0400 Subject: fs: use nth_page() in place of direct struct page manipulation When dealing with hugetlb pages, struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. Without the fix, a wrong subpage might be checked for HWPoison, causing wrong number of bytes of a page copied to user space. No bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-5-zi.yan@sent.com Fixes: 38c1ddbde6c6 ("hugetlbfs: improve read HWPOISON hugepage") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 316c4cebd3f3..60fce26ff937 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -295,7 +295,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt size_t res = 0; /* First subpage to start the loop. */ - page += offset / PAGE_SIZE; + page = nth_page(page, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) @@ -309,7 +309,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt break; offset += n; if (offset == PAGE_SIZE) { - page++; + page = nth_page(page, 1); offset = 0; } } -- cgit v1.2.3 From aa5fe31b6b59210cb4ea28a59e68781f48eeca74 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:48 -0400 Subject: mips: use nth_page() in place of direct struct page manipulation __flush_dcache_pages() is called during hugetlb migration via migrate_pages() -> migrate_hugetlbs() -> unmap_and_move_huge_page() -> move_to_new_folio() -> flush_dcache_folio(). And with hugetlb and without sparsemem vmemmap, struct page is not guaranteed to be contiguous beyond a section. Use nth_page() instead. Without the fix, a wrong address might be used for data cache page flush. No bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-6-zi.yan@sent.com Fixes: 15fa3e8e3269 ("mips: implement the new page table range API") Signed-off-by: Zi Yan Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- arch/mips/mm/cache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 02042100e267..7f830634dbe7 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -117,7 +117,7 @@ void __flush_dcache_pages(struct page *page, unsigned int nr) * get faulted into the tlb (and thus flushed) anyways. */ for (i = 0; i < nr; i++) { - addr = (unsigned long)kmap_local_page(page + i); + addr = (unsigned long)kmap_local_page(nth_page(page, i)); flush_data_cache_page(addr); kunmap_local((void *)addr); } -- cgit v1.2.3 From 4472edf63d6630e6cf65e205b4fc8c3c94d0afe5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 14 Sep 2023 02:15:23 +0000 Subject: mm/damon/core: use number of passed access sampling as a timer DAMON sleeps for sampling interval after each sampling, and check if the aggregation interval and the ops update interval have passed using ktime_get_coarse_ts64() and baseline timestamps for the intervals. That design is for making the operations occur at deterministic timing regardless of the time that spend for each work. However, it turned out it is not that useful, and incur not-that-intuitive results. After all, timer functions, and especially sleep functions that DAMON uses to wait for specific timing, are not necessarily strictly accurate. It is legal design, so no problem. However, depending on such inaccuracies, the nr_accesses can be larger than aggregation interval divided by sampling interval. For example, with the default setting (5 ms sampling interval and 100 ms aggregation interval) we frequently show regions having nr_accesses larger than 20. Also, if the execution of a DAMOS scheme takes a long time, next aggregation could happen before enough number of samples are collected. This is not what usual users would intuitively expect. Since access check sampling is the smallest unit work of DAMON, using the number of passed sampling intervals as the DAMON-internal timer can easily avoid these problems. That is, convert aggregation and ops update intervals to numbers of sampling intervals that need to be passed before those operations be executed, count the number of passed sampling intervals, and invoke the operations as soon as the specific amount of sampling intervals passed. Make the change. Note that this could make a behavioral change to settings that using intervals that not aligned by the sampling interval. For example, if the sampling interval is 5 ms and the aggregation interval is 12 ms, DAMON effectively uses 15 ms as its aggregation interval, because it checks whether the aggregation interval after sleeping the sampling interval. This change will make DAMON to effectively use 10 ms as aggregation interval, since it uses 'aggregation interval / sampling interval * sampling interval' as the effective aggregation interval, and we don't use floating point types. Usual users would have used aligned intervals, so this behavioral change is not expected to make any meaningful impact, so just make this change. Link: https://lkml.kernel.org/r/20230914021523.60649-1-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 14 ++++++-- mm/damon/core.c | 96 +++++++++++++++++++++++++-------------------------- 2 files changed, 59 insertions(+), 51 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index ab3089de1478..9a32b8fd0bd3 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -524,8 +524,18 @@ struct damon_ctx { struct damon_attrs attrs; /* private: internal use only */ - struct timespec64 last_aggregation; - struct timespec64 last_ops_update; + /* number of sample intervals that passed since this context started */ + unsigned long passed_sample_intervals; + /* + * number of sample intervals that should be passed before next + * aggregation + */ + unsigned long next_aggregation_sis; + /* + * number of sample intervals that should be passed before next ops + * update + */ + unsigned long next_ops_update_sis; /* public: */ struct task_struct *kdamond; diff --git a/mm/damon/core.c b/mm/damon/core.c index 3ca34a252a3c..c5b7296c69a0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -427,8 +427,10 @@ struct damon_ctx *damon_new_ctx(void) ctx->attrs.aggr_interval = 100 * 1000; ctx->attrs.ops_update_interval = 60 * 1000 * 1000; - ktime_get_coarse_ts64(&ctx->last_aggregation); - ctx->last_ops_update = ctx->last_aggregation; + ctx->passed_sample_intervals = 0; + /* These will be set from kdamond_init_intervals_sis() */ + ctx->next_aggregation_sis = 0; + ctx->next_ops_update_sis = 0; mutex_init(&ctx->kdamond_lock); @@ -552,6 +554,9 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, */ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { + unsigned long sample_interval = attrs->sample_interval ? + attrs->sample_interval : 1; + if (attrs->min_nr_regions < 3) return -EINVAL; if (attrs->min_nr_regions > attrs->max_nr_regions) @@ -559,6 +564,11 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->sample_interval > attrs->aggr_interval) return -EINVAL; + ctx->next_aggregation_sis = ctx->passed_sample_intervals + + attrs->aggr_interval / sample_interval; + ctx->next_ops_update_sis = ctx->passed_sample_intervals + + attrs->ops_update_interval / sample_interval; + damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; return 0; @@ -732,38 +742,6 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs) return err; } -/* - * damon_check_reset_time_interval() - Check if a time interval is elapsed. - * @baseline: the time to check whether the interval has elapsed since - * @interval: the time interval (microseconds) - * - * See whether the given time interval has passed since the given baseline - * time. If so, it also updates the baseline to current time for next check. - * - * Return: true if the time interval has passed, or false otherwise. - */ -static bool damon_check_reset_time_interval(struct timespec64 *baseline, - unsigned long interval) -{ - struct timespec64 now; - - ktime_get_coarse_ts64(&now); - if ((timespec64_to_ns(&now) - timespec64_to_ns(baseline)) < - interval * 1000) - return false; - *baseline = now; - return true; -} - -/* - * Check whether it is time to flush the aggregated information - */ -static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) -{ - return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->attrs.aggr_interval); -} - /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1274,18 +1252,6 @@ static void kdamond_split_regions(struct damon_ctx *ctx) last_nr_regions = nr_regions; } -/* - * Check whether it is time to check and apply the operations-related data - * structures. - * - * Returns true if it is. - */ -static bool kdamond_need_update_operations(struct damon_ctx *ctx) -{ - return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->attrs.ops_update_interval); -} - /* * Check whether current monitoring should be stopped * @@ -1397,6 +1363,17 @@ static int kdamond_wait_activation(struct damon_ctx *ctx) return -EBUSY; } +static void kdamond_init_intervals_sis(struct damon_ctx *ctx) +{ + unsigned long sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + + ctx->passed_sample_intervals = 0; + ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; + ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / + sample_interval; +} + /* * The monitoring daemon that runs as a kernel thread */ @@ -1410,6 +1387,8 @@ static int kdamond_fn(void *data) pr_debug("kdamond (%d) starts\n", current->pid); + kdamond_init_intervals_sis(ctx); + if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) @@ -1418,6 +1397,17 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); while (!kdamond_need_stop(ctx)) { + /* + * ctx->attrs and ctx->next_{aggregation,ops_update}_sis could + * be changed from after_wmarks_check() or after_aggregation() + * callbacks. Read the values here, and use those for this + * iteration. That is, damon_set_attrs() updated new values + * are respected from next iteration. + */ + unsigned long next_aggregation_sis = ctx->next_aggregation_sis; + unsigned long next_ops_update_sis = ctx->next_ops_update_sis; + unsigned long sample_interval = ctx->attrs.sample_interval; + if (kdamond_wait_activation(ctx)) break; @@ -1427,12 +1417,17 @@ static int kdamond_fn(void *data) ctx->callback.after_sampling(ctx)) break; - kdamond_usleep(ctx->attrs.sample_interval); + kdamond_usleep(sample_interval); + ctx->passed_sample_intervals++; if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (kdamond_aggregate_interval_passed(ctx)) { + sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + if (ctx->passed_sample_intervals == next_aggregation_sis) { + ctx->next_aggregation_sis = next_aggregation_sis + + ctx->attrs.aggr_interval / sample_interval; kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); @@ -1447,7 +1442,10 @@ static int kdamond_fn(void *data) ctx->ops.reset_aggregated(ctx); } - if (kdamond_need_update_operations(ctx)) { + if (ctx->passed_sample_intervals == next_ops_update_sis) { + ctx->next_ops_update_sis = next_ops_update_sis + + ctx->attrs.ops_update_interval / + sample_interval; if (ctx->ops.update) ctx->ops.update(ctx); sz_limit = damon_region_sz_limit(ctx); -- cgit v1.2.3 From 78fbfb155d204428119310d1b9df665ab88da6e8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:44 +0000 Subject: mm/damon/core: define and use a dedicated function for region access rate update Patch series "mm/damon: provide pseudo-moving sum based access rate". DAMON checks the access to each region for every sampling interval, increase the access rate counter of the region, namely nr_accesses, if the access was made. For every aggregation interval, the counter is reset. The counter is exposed to users to be used as a metric showing the relative access rate (frequency) of each region. In other words, DAMON provides access rate of each region in every aggregation interval. The aggregation avoids temporal access pattern changes making things confusing. However, this also makes a few DAMON-related operations to unnecessarily need to be aligned to the aggregation interval. This can restrict the flexibility of DAMON applications, especially when the aggregation interval is huge. To provide the monitoring results in finer-grained timing while keeping handling of temporal access pattern change, this patchset implements a pseudo-moving sum based access rate metric. It is pseudo-moving sum because strict moving sum implementation would need to keep all values for last time window, and that could incur high overhead of there could be arbitrary number of values in a time window. Especially in case of the nr_accesses, since the sampling interval and aggregation interval can arbitrarily set and the past values should be maintained for every region, it could be risky. The pseudo-moving sum assumes there were no temporal access pattern change in last discrete time window to remove the needs for keeping the list of the last time window values. As a result, it beocmes not strict moving sum implementation, but provides a reasonable accuracy. Also, it keeps an important property of the moving sum. That is, the moving sum becomes same to discrete-window based sum at the time that aligns to the time window. This means using the pseudo moving sum based nr_accesses makes no change to users who shows the value for every aggregation interval. Patches Sequence ---------------- The sequence of the patches is as follows. The first four patches are for preparation of the change. The first two (patches 1 and 2) implements a helper function for nr_accesses update and eliminate corner case that skips use of the function, respectively. Following two (patches 3 and 4) respectively implement the pseudo-moving sum function and its simple unit test case. Two patches for making DAMON to use the pseudo-moving sum follow. The fifthe one (patch 5) introduces a new field for representing the pseudo-moving sum-based access rate of each region, and the sixth one makes the new representation to actually updated with the pseudo-moving sum function. Last two patches (patches 7 and 8) makes followup fixes for skipping unnecessary updates and marking the moving sum function as static, respectively. This patch (of 8): Each DAMON operarions set is updating nr_accesses field of each damon_region for each of their access check results, from the check_accesses() callback. Directly accessing the field could make things complex to manage and change in future. Define and use a dedicated function for the purpose. Link: https://lkml.kernel.org/r/20230915025251.72816-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230915025251.72816-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 ++++- mm/damon/core.c | 16 ++++++++++++++++ mm/damon/paddr.c | 6 ++---- mm/damon/vaddr.c | 6 ++---- 4 files changed, 24 insertions(+), 9 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 9a32b8fd0bd3..17c504d236b9 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -45,7 +45,9 @@ struct damon_addr_range { * * @nr_accesses is reset to zero for every &damon_attrs->aggr_interval and be * increased for every &damon_attrs->sample_interval if an access to the region - * during the last sampling interval is found. + * during the last sampling interval is found. The update of this field should + * not be done with direct access but with the helper function, + * damon_update_region_access_rate(). * * @age is initially zero, increased for each aggregation interval, and reset * to zero again if the access frequency is significantly changed. If two @@ -620,6 +622,7 @@ void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +void damon_update_region_access_rate(struct damon_region *r, bool accessed); struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching); diff --git a/mm/damon/core.c b/mm/damon/core.c index c5b7296c69a0..10532159323a 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1549,6 +1549,22 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return damon_set_regions(t, &addr_range, 1); } +/** + * damon_update_region_access_rate() - Update the access rate of a region. + * @r: The DAMON region to update for its access check result. + * @accessed: Whether the region has accessed during last sampling interval. + * + * Update the access rate of a region with the region's last sampling interval + * access check result. + * + * Usually this will be called by &damon_operations->check_accesses callback. + */ +void damon_update_region_access_rate(struct damon_region *r, bool accessed) +{ + if (accessed) + r->nr_accesses++; +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 909db25efb35..44f21860b555 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -157,14 +157,12 @@ static void __damon_pa_check_access(struct damon_region *r) /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); return; } last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); last_addr = r->sampling_addr; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index cf8a9fc5c9d1..53371bbec605 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -567,14 +567,12 @@ static void __damon_va_check_access(struct mm_struct *mm, /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); - if (last_accessed) - r->nr_accesses++; + damon_update_region_access_rate(r, last_accessed); last_addr = r->sampling_addr; } -- cgit v1.2.3 From 22a7788038a660df8876ba54b941b418d2178f99 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:45 +0000 Subject: mm/damon/vaddr: call damon_update_region_access_rate() always When getting mm_struct of the monitoring target process fails, there wil be no need to increase the access rate counter (nr_accesses) of the regions for the process. Hence, damon_va_check_accesses() skips calling damon_update_region_access_rate() in the case. This breaks the assumption that damon_update_region_access_rate() is called for every region, for every sampling interval. Call the function for every region even in the case. This might increase the overhead in some cases, but such case would not be frequent, so no significant impact is really expected. Link: https://lkml.kernel.org/r/20230915025251.72816-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 53371bbec605..02ab448d9b1e 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -564,6 +564,11 @@ static void __damon_va_check_access(struct mm_struct *mm, static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; + if (!mm) { + damon_update_region_access_rate(r, false); + return; + } + /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { @@ -587,15 +592,14 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { mm = damon_get_mm(t); - if (!mm) - continue; same_target = false; damon_for_each_region(r, t) { __damon_va_check_access(mm, r, same_target); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); same_target = true; } - mmput(mm); + if (mm) + mmput(mm); } return max_nr_accesses; -- cgit v1.2.3 From d2c062ade07ffd206dd16bf085f02abc59651309 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:46 +0000 Subject: mm/damon/core: implement a pseudo-moving sum function For values that continuously change, moving average or sum are good ways to provide fast updates while handling temporal and errorneous variability of the value. For example, the access rate counter (nr_accesses) is calculated as a sum of the number of positive sampled access check results that collected during a discrete time window (aggregation interval), and hence it handles temporal and errorneous access check results, but provides the update only for every aggregation interval. Using a moving sum method for that could allow providing the value for every sampling interval. That could be useful for getting monitoring results snapshot or running DAMOS in fine-grained timing. However, supporting the moving sum for cases that number of samples in the time window is arbirary could impose high overhead, since the number of past values that it needs to keep could be too high. The nr_accesses would also be one of the cases. To mitigate the overhead, implement a pseudo-moving sum function that only provides an estimated pseudo-moving sum. It assumes there was no error in last discrete time window and subtract constant portion of last discrete time window sum. Note that the function is not strictly implementing the moving sum, but it keeps a property of moving sum, which makes the value same to the dsicrete-window based sum for each time window-aligned timing. Hence, people collecting the value in the old timings would show no difference. Link: https://lkml.kernel.org/r/20230915025251.72816-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 17c504d236b9..487a545a11b4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -622,6 +622,8 @@ void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, + unsigned int len_window, unsigned int new_value); void damon_update_region_access_rate(struct damon_region *r, bool accessed); struct damos_filter *damos_new_filter(enum damos_filter_type type, diff --git a/mm/damon/core.c b/mm/damon/core.c index 10532159323a..b005dc15009f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1549,6 +1549,46 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, return damon_set_regions(t, &addr_range, 1); } +/* + * damon_moving_sum() - Calculate an inferred moving sum value. + * @mvsum: Inferred sum of the last @len_window values. + * @nomvsum: Non-moving sum of the last discrete @len_window window values. + * @len_window: The number of last values to take care of. + * @new_value: New value that will be added to the pseudo moving sum. + * + * Moving sum (moving average * window size) is good for handling noise, but + * the cost of keeping past values can be high for arbitrary window size. This + * function implements a lightweight pseudo moving sum function that doesn't + * keep the past window values. + * + * It simply assumes there was no noise in the past, and get the no-noise + * assumed past value to drop from @nomvsum and @len_window. @nomvsum is a + * non-moving sum of the last window. For example, if @len_window is 10 and we + * have 25 values, @nomvsum is the sum of the 11th to 20th values of the 25 + * values. Hence, this function simply drops @nomvsum / @len_window from + * given @mvsum and add @new_value. + * + * For example, if @len_window is 10 and @nomvsum is 50, the last 10 values for + * the last window could be vary, e.g., 0, 10, 0, 10, 0, 10, 0, 0, 0, 20. For + * calculating next moving sum with a new value, we should drop 0 from 50 and + * add the new value. However, this function assumes it got value 5 for each + * of the last ten times. Based on the assumption, when the next value is + * measured, it drops the assumed past value, 5 from the current sum, and add + * the new value to get the updated pseduo-moving average. + * + * This means the value could have errors, but the errors will be disappeared + * for every @len_window aligned calls. For example, if @len_window is 10, the + * pseudo moving sum with 11th value to 19th value would have an error. But + * the sum with 20th value will not have the error. + * + * Return: Pseudo-moving average after getting the @new_value. + */ +unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, + unsigned int len_window, unsigned int new_value) +{ + return mvsum - nomvsum / len_window + new_value; +} + /** * damon_update_region_access_rate() - Update the access rate of a region. * @r: The DAMON region to update for its access check result. -- cgit v1.2.3 From 0926e8ff96b58845e802438374caaca53ab36053 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:47 +0000 Subject: mm/damon/core-test: add a unit test for damon_moving_sum() Add a simple unit test for the pseudo moving-sum function (damon_moving_sum()). Link: https://lkml.kernel.org/r/20230915025251.72816-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 6cc8b245586d..c539f0e8377e 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -341,6 +341,21 @@ static void damon_test_set_attrs(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); } +static void damon_test_moving_sum(struct kunit *test) +{ + unsigned int mvsum = 50000, nomvsum = 50000, len_window = 10; + unsigned int new_values[] = {10000, 0, 10000, 0, 0, 0, 10000, 0, 0, 0}; + unsigned int expects[] = {55000, 50000, 55000, 50000, 45000, 40000, + 45000, 40000, 35000, 30000}; + int i; + + for (i = 0; i < ARRAY_SIZE(new_values); i++) { + mvsum = damon_moving_sum(mvsum, nomvsum, len_window, + new_values[i]); + KUNIT_EXPECT_EQ(test, mvsum, expects[i]); + } +} + static void damos_test_new_filter(struct kunit *test) { struct damos_filter *filter; @@ -425,6 +440,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_regions), KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), + KUNIT_CASE(damon_test_moving_sum), KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), {}, -- cgit v1.2.3 From 80333828ea7728ebe85d079bb5c1467eb9fc6c8c Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:48 +0000 Subject: mm/damon/core: introduce nr_accesses_bp Add yet another representation of the access rate of each region, namely nr_accesses_bp. It is just same to the nr_accesses but represents the value in basis point (1 in 10,000), and updated at once in every aggregation interval. That is, moving_accesses_bp is just nr_accesses * 10000. This may seems useless at the moment. However, it will be useful for representing less than one nr_accesses value that will be needed to make moving sum-based nr_accesses. Link: https://lkml.kernel.org/r/20230915025251.72816-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 5 +++++ mm/damon/core-test.h | 5 +++++ mm/damon/core.c | 6 ++++++ 3 files changed, 16 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 487a545a11b4..15f24b23c9a0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -40,6 +40,7 @@ struct damon_addr_range { * @ar: The address range of the region. * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. + * @nr_accesses_bp: @nr_accesses in basis point (0.01%). * @list: List head for siblings. * @age: Age of this region. * @@ -49,6 +50,9 @@ struct damon_addr_range { * not be done with direct access but with the helper function, * damon_update_region_access_rate(). * + * @nr_accesses_bp is another representation of @nr_accesses in basis point + * (1 in 10,000) that updated every aggregation interval. + * * @age is initially zero, increased for each aggregation interval, and reset * to zero again if the access frequency is significantly changed. If two * regions are merged into a new region, both @nr_accesses and @age of the new @@ -58,6 +62,7 @@ struct damon_region { struct damon_addr_range ar; unsigned long sampling_addr; unsigned int nr_accesses; + unsigned int nr_accesses_bp; struct list_head list; unsigned int age; diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index c539f0e8377e..79f1f12e0dd5 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -94,6 +94,7 @@ static void damon_test_aggregate(struct kunit *test) for (ir = 0; ir < 3; ir++) { r = damon_new_region(saddr[it][ir], eaddr[it][ir]); r->nr_accesses = accesses[it][ir]; + r->nr_accesses_bp = accesses[it][ir] * 10000; damon_add_region(r, t); } it++; @@ -147,9 +148,11 @@ static void damon_test_merge_two(struct kunit *test) t = damon_new_target(); r = damon_new_region(0, 100); r->nr_accesses = 10; + r->nr_accesses_bp = 100000; damon_add_region(r, t); r2 = damon_new_region(100, 300); r2->nr_accesses = 20; + r2->nr_accesses_bp = 200000; damon_add_region(r2, t); damon_merge_two_regions(t, r, r2); @@ -196,6 +199,7 @@ static void damon_test_merge_regions_of(struct kunit *test) for (i = 0; i < ARRAY_SIZE(sa); i++) { r = damon_new_region(sa[i], ea[i]); r->nr_accesses = nrs[i]; + r->nr_accesses_bp = nrs[i] * 10000; damon_add_region(r, t); } @@ -297,6 +301,7 @@ static void damon_test_update_monitoring_result(struct kunit *test) struct damon_region *r = damon_new_region(3, 7); r->nr_accesses = 15; + r->nr_accesses_bp = 150000; r->age = 20; new_attrs = (struct damon_attrs){ diff --git a/mm/damon/core.c b/mm/damon/core.c index b005dc15009f..ce85c00b0a4c 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -128,6 +128,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) region->ar.start = start; region->ar.end = end; region->nr_accesses = 0; + region->nr_accesses_bp = 0; INIT_LIST_HEAD(®ion->list); region->age = 0; @@ -508,6 +509,7 @@ static void damon_update_monitoring_result(struct damon_region *r, { r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->nr_accesses * 10000; r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); } @@ -1115,6 +1117,7 @@ static void damon_merge_two_regions(struct damon_target *t, l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); + l->nr_accesses_bp = l->nr_accesses * 10000; l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r); l->ar.end = r->ar.end; damon_destroy_region(r, t); @@ -1138,6 +1141,8 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, else r->age++; + r->nr_accesses_bp = r->nr_accesses * 10000; + if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) @@ -1186,6 +1191,7 @@ static void damon_split_region_at(struct damon_target *t, new->age = r->age; new->last_nr_accesses = r->last_nr_accesses; + new->nr_accesses_bp = r->nr_accesses_bp; damon_insert_region(new, r, damon_next_region(r), t); } -- cgit v1.2.3 From ace30fb21af5f1be1605db72c16040b95b1557ef Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:49 +0000 Subject: mm/damon/core: use pseudo-moving sum for nr_accesses_bp Let nr_accesses_bp be calculated as a pseudo-moving sum that updated for every sampling interval, using damon_moving_sum(). This is assumed to be useful for cases that the aggregation interval is set quite huge, but the monivoting results need to be collected earlier than next aggregation interval is passed. Link: https://lkml.kernel.org/r/20230915025251.72816-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 12 +++++++++--- mm/damon/core.c | 16 +++++++++++++++- mm/damon/paddr.c | 9 +++++---- mm/damon/vaddr.c | 12 +++++++----- 4 files changed, 36 insertions(+), 13 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 15f24b23c9a0..0fe13482df63 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -40,7 +40,8 @@ struct damon_addr_range { * @ar: The address range of the region. * @sampling_addr: Address of the sample for the next access check. * @nr_accesses: Access frequency of this region. - * @nr_accesses_bp: @nr_accesses in basis point (0.01%). + * @nr_accesses_bp: @nr_accesses in basis point (0.01%) that updated for + * each sampling interval. * @list: List head for siblings. * @age: Age of this region. * @@ -51,7 +52,11 @@ struct damon_addr_range { * damon_update_region_access_rate(). * * @nr_accesses_bp is another representation of @nr_accesses in basis point - * (1 in 10,000) that updated every aggregation interval. + * (1 in 10,000) that updated for every &damon_attrs->sample_interval in a + * manner similar to moving sum. By the algorithm, this value becomes + * @nr_accesses * 10000 for every &struct damon_attrs->aggr_interval. This can + * be used when the aggregation interval is too huge and therefore cannot wait + * for it before getting the access monitoring results. * * @age is initially zero, increased for each aggregation interval, and reset * to zero again if the access frequency is significantly changed. If two @@ -629,7 +634,8 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, unsigned int len_window, unsigned int new_value); -void damon_update_region_access_rate(struct damon_region *r, bool accessed); +void damon_update_region_access_rate(struct damon_region *r, bool accessed, + struct damon_attrs *attrs); struct damos_filter *damos_new_filter(enum damos_filter_type type, bool matching); diff --git a/mm/damon/core.c b/mm/damon/core.c index ce85c00b0a4c..29ee1fc18393 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1599,14 +1599,28 @@ unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, * damon_update_region_access_rate() - Update the access rate of a region. * @r: The DAMON region to update for its access check result. * @accessed: Whether the region has accessed during last sampling interval. + * @attrs: The damon_attrs of the DAMON context. * * Update the access rate of a region with the region's last sampling interval * access check result. * * Usually this will be called by &damon_operations->check_accesses callback. */ -void damon_update_region_access_rate(struct damon_region *r, bool accessed) +void damon_update_region_access_rate(struct damon_region *r, bool accessed, + struct damon_attrs *attrs) { + unsigned int len_window = 1; + + /* + * sample_interval can be zero, but cannot be larger than + * aggr_interval, owing to validation of damon_set_attrs(). + */ + if (attrs->sample_interval) + len_window = attrs->aggr_interval / attrs->sample_interval; + r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp, + r->last_nr_accesses * 10000, len_window, + accessed ? 10000 : 0); + if (accessed) r->nr_accesses++; } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 44f21860b555..081e2a325778 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -148,7 +148,8 @@ out: return accessed; } -static void __damon_pa_check_access(struct damon_region *r) +static void __damon_pa_check_access(struct damon_region *r, + struct damon_attrs *attrs) { static unsigned long last_addr; static unsigned long last_folio_sz = PAGE_SIZE; @@ -157,12 +158,12 @@ static void __damon_pa_check_access(struct damon_region *r) /* If the region is in the last checked page, reuse the result */ if (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz)) { - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); return; } last_accessed = damon_pa_young(r->sampling_addr, &last_folio_sz); - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); last_addr = r->sampling_addr; } @@ -175,7 +176,7 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) { - __damon_pa_check_access(r); + __damon_pa_check_access(r, &ctx->attrs); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 02ab448d9b1e..a4d1f63c5b23 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -558,26 +558,27 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * r the region to be checked */ static void __damon_va_check_access(struct mm_struct *mm, - struct damon_region *r, bool same_target) + struct damon_region *r, bool same_target, + struct damon_attrs *attrs) { static unsigned long last_addr; static unsigned long last_folio_sz = PAGE_SIZE; static bool last_accessed; if (!mm) { - damon_update_region_access_rate(r, false); + damon_update_region_access_rate(r, false, attrs); return; } /* If the region is in the last checked page, reuse the result */ if (same_target && (ALIGN_DOWN(last_addr, last_folio_sz) == ALIGN_DOWN(r->sampling_addr, last_folio_sz))) { - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); return; } last_accessed = damon_va_young(mm, r->sampling_addr, &last_folio_sz); - damon_update_region_access_rate(r, last_accessed); + damon_update_region_access_rate(r, last_accessed, attrs); last_addr = r->sampling_addr; } @@ -594,7 +595,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) mm = damon_get_mm(t); same_target = false; damon_for_each_region(r, t) { - __damon_va_check_access(mm, r, same_target); + __damon_va_check_access(mm, r, same_target, + &ctx->attrs); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); same_target = true; } -- cgit v1.2.3 From 401807a316bb913f5eefcaf8343575ec9296d6b1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:50 +0000 Subject: mm/damon/core: skip updating nr_accesses_bp for each aggregation interval damon_merge_regions_of(), which is called for each aggregation interval, updates nr_accesses_bp to nr_accesses * 10000. However, nr_accesses_bp is updated for each sampling interval via damon_moving_sum() using the aggregation interval as the moving time window. And by the definition of the algorithm, the value becomes same to discrete-window based sum for each time window-aligned time. Hence, nr_accesses_bp will be same to nr_accesses * 10000 for each aggregation interval without explicit update. Remove the unnecessary update of nr_accesses_bp in damon_merge_regions_of(). Link: https://lkml.kernel.org/r/20230915025251.72816-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 29ee1fc18393..45cc108c0fe1 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1141,8 +1141,6 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, else r->age++; - r->nr_accesses_bp = r->nr_accesses * 10000; - if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) -- cgit v1.2.3 From 863803a7948c8e33e6a7b002017747ca83ecfd63 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 15 Sep 2023 02:52:51 +0000 Subject: mm/damon/core: mark damon_moving_sum() as a static function The function is used by only mm/damon/core.c. Mark it as a static function. Link: https://lkml.kernel.org/r/20230915025251.72816-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 -- mm/damon/core.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 0fe13482df63..491fdd3e4c76 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -632,8 +632,6 @@ void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, - unsigned int len_window, unsigned int new_value); void damon_update_region_access_rate(struct damon_region *r, bool accessed, struct damon_attrs *attrs); diff --git a/mm/damon/core.c b/mm/damon/core.c index 45cc108c0fe1..b15cf47d2d29 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1587,7 +1587,7 @@ int damon_set_region_biggest_system_ram_default(struct damon_target *t, * * Return: Pseudo-moving average after getting the @new_value. */ -unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, +static unsigned int damon_moving_sum(unsigned int mvsum, unsigned int nomvsum, unsigned int len_window, unsigned int new_value) { return mvsum - nomvsum / len_window + new_value; -- cgit v1.2.3 From a9e34ea1f62c1e359ed197ec01d056c25edb2b61 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:53:58 +0100 Subject: mm: hugetlb_vmemmap: use nid of the head page to reallocate it Patch series "mm: hugetlb: Skip initialization of gigantic tail struct pages if freed by HVO", v5. This series moves the boot time initialization of tail struct pages of a gigantic page to later on in the boot. Only the HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized at the start. If HVO is successful, then no more tail struct pages need to be initialized. For a 1G hugepage, this series avoid initialization of 262144 - 63 = 262081 struct pages per hugepage. When tested on a 512G system (allocating 500 1G hugepages), the kexec-boot times with DEFERRED_STRUCT_PAGE_INIT enabled are: - with patches, HVO enabled: 1.32 seconds - with patches, HVO disabled: 2.15 seconds - without patches, HVO enabled: 3.90 seconds - without patches, HVO disabled: 3.58 seconds This represents an approximately 70% reduction in boot time and will significantly reduce server downtime when using a large number of gigantic pages. This patch (of 4): If tail page prep and initialization is skipped, then the "start" page will not contain the correct nid. Use the nid from first vmemap page. Link: https://lkml.kernel.org/r/20230913105401.519709-1-usama.arif@bytedance.com Link: https://lkml.kernel.org/r/20230913105401.519709-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 0bde38626d25..ad650c7b07ec 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -319,7 +319,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, }; - int nid = page_to_nid((struct page *)start); + int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* -- cgit v1.2.3 From ee8d2071ef52d83a2ac4f8a474fafb2aea91766d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:53:59 +0100 Subject: memblock: pass memblock_type to memblock_setclr_flag This allows setting flags to both memblock types and is in preparation for setting flags (for e.g. to not initialize struct pages) on reserved memory region. [usama.arif@bytedance.com: add missing argument definition] Link: https://lkml.kernel.org/r/20230918090657.220463-1-usama.arif@bytedance.com Link: https://lkml.kernel.org/r/20230913105401.519709-3-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Rapoport (IBM) Acked-by: Mike Kravetz Cc: Fam Zheng Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memblock.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memblock.c b/mm/memblock.c index 913b2520a9a0..b978cda96cf0 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -892,6 +892,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) /** * memblock_setclr_flag - set or clear flag for a memory region + * @type: memblock type to set/clear flag for * @base: base address of the region * @size: size of the region * @set: set or clear the flag @@ -901,10 +902,9 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) * * Return: 0 on success, -errno on failure. */ -static int __init_memblock memblock_setclr_flag(phys_addr_t base, - phys_addr_t size, int set, int flag) +static int __init_memblock memblock_setclr_flag(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, int set, int flag) { - struct memblock_type *type = &memblock.memory; int i, ret, start_rgn, end_rgn; ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); @@ -933,7 +933,7 @@ static int __init_memblock memblock_setclr_flag(phys_addr_t base, */ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_HOTPLUG); } /** @@ -945,7 +945,7 @@ int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_HOTPLUG); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_HOTPLUG); } /** @@ -962,7 +962,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) system_has_some_mirror = true; - return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_MIRROR); } /** @@ -982,7 +982,7 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 1, MEMBLOCK_NOMAP); } /** @@ -994,7 +994,7 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size) */ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) { - return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP); + return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); } static bool should_skip_region(struct memblock_type *type, -- cgit v1.2.3 From 77e6c43e137c130138c3fbadc847351a83c4befe Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:00 +0100 Subject: memblock: introduce MEMBLOCK_RSRV_NOINIT flag For reserved memory regions marked with this flag, reserve_bootmem_region is not called during memmap_init_reserved_pages. This can be used to avoid struct page initialization for regions which won't need them, for e.g. hugepages with Hugepage Vmemmap Optimization enabled. Link: https://lkml.kernel.org/r/20230913105401.519709-4-usama.arif@bytedance.com Signed-off-by: Usama Arif Acked-by: Muchun Song Reviewed-by: Mike Rapoport (IBM) Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- include/linux/memblock.h | 9 +++++++++ mm/memblock.c | 33 ++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 1c1072e3ca06..ae3bde302f70 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -40,6 +40,8 @@ extern unsigned long long max_possible_pfn; * via a driver, and never indicated in the firmware-provided memory map as * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the * kernel resource tree. + * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are + * not initialized (only for reserved regions). */ enum memblock_flags { MEMBLOCK_NONE = 0x0, /* No special request */ @@ -47,6 +49,7 @@ enum memblock_flags { MEMBLOCK_MIRROR = 0x2, /* mirrored region */ MEMBLOCK_NOMAP = 0x4, /* don't add to kernel direct mapping */ MEMBLOCK_DRIVER_MANAGED = 0x8, /* always detected via a driver */ + MEMBLOCK_RSRV_NOINIT = 0x10, /* don't initialize struct pages */ }; /** @@ -125,6 +128,7 @@ int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); +int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); void memblock_free_all(void); void memblock_free(void *ptr, size_t size); @@ -259,6 +263,11 @@ static inline bool memblock_is_nomap(struct memblock_region *m) return m->flags & MEMBLOCK_NOMAP; } +static inline bool memblock_is_reserved_noinit(struct memblock_region *m) +{ + return m->flags & MEMBLOCK_RSRV_NOINIT; +} + static inline bool memblock_is_driver_managed(struct memblock_region *m) { return m->flags & MEMBLOCK_DRIVER_MANAGED; diff --git a/mm/memblock.c b/mm/memblock.c index b978cda96cf0..fd492e5bbdbc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -997,6 +997,24 @@ int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size) return memblock_setclr_flag(&memblock.memory, base, size, 0, MEMBLOCK_NOMAP); } +/** + * memblock_reserved_mark_noinit - Mark a reserved memory region with flag + * MEMBLOCK_RSRV_NOINIT which results in the struct pages not being initialized + * for this region. + * @base: the base phys addr of the region + * @size: the size of the region + * + * struct pages will not be initialized for reserved memory regions marked with + * %MEMBLOCK_RSRV_NOINIT. + * + * Return: 0 on success, -errno on failure. + */ +int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size) +{ + return memblock_setclr_flag(&memblock.reserved, base, size, 1, + MEMBLOCK_RSRV_NOINIT); +} + static bool should_skip_region(struct memblock_type *type, struct memblock_region *m, int nid, int flags) @@ -2113,13 +2131,18 @@ static void __init memmap_init_reserved_pages(void) memblock_set_node(start, end, &memblock.reserved, nid); } - /* initialize struct pages for the reserved regions */ + /* + * initialize struct pages for reserved regions that don't have + * the MEMBLOCK_RSRV_NOINIT flag set + */ for_each_reserved_mem_region(region) { - nid = memblock_get_region_node(region); - start = region->base; - end = start + region->size; + if (!memblock_is_reserved_noinit(region)) { + nid = memblock_get_region_node(region); + start = region->base; + end = start + region->size; - reserve_bootmem_region(start, end, nid); + reserve_bootmem_region(start, end, nid); + } } } -- cgit v1.2.3 From fde1c4ecf91640e5a95ec36b71ec2e8ec379ce40 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 13 Sep 2023 11:54:01 +0100 Subject: mm: hugetlb: skip initialization of gigantic tail struct pages if freed by HVO The new boot flow when it comes to initialization of gigantic pages is as follows: - At boot time, for a gigantic page during __alloc_bootmem_hugepage, the region after the first struct page is marked as noinit. - This results in only the first struct page to be initialized in reserve_bootmem_region. As the tail struct pages are not initialized at this point, there can be a significant saving in boot time if HVO succeeds later on. - Later on in the boot, the head page is prepped and the first HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page) - 1 tail struct pages are initialized. - HVO is attempted. If it is not successful, then the rest of the tail struct pages are initialized. If it is successful, no more tail struct pages need to be initialized saving significant boot time. The WARN_ON for increased ref count in gather_bootmem_prealloc was changed to a VM_BUG_ON. This is OK as there should be no speculative references this early in boot process. The VM_BUG_ON's are there just in case such code is introduced. [akpm@linux-foundation.org: make it nicer for 80 cols] Link: https://lkml.kernel.org/r/20230913105401.519709-5-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Reviewed-by: Mike Kravetz Cc: Fam Zheng Cc: Mike Rapoport (IBM) Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 68 +++++++++++++++++++++++++++++++++++++++++++++------- mm/hugetlb_vmemmap.c | 2 +- mm/hugetlb_vmemmap.h | 9 +++---- mm/internal.h | 3 +++ mm/mm_init.c | 2 +- 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a945efe2858a..4e276466d6aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3169,6 +3169,16 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) } found: + + /* + * Only initialize the head struct page in memmap_init_reserved_pages, + * rest of the struct pages will be initialized by the HugeTLB + * subsystem itself. + * The head struct page is used to get folio information by the HugeTLB + * subsystem like zone id and node id. + */ + memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE), + huge_page_size(h) - PAGE_SIZE); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -3176,6 +3186,43 @@ found: return 1; } +/* Initialize [start_page:end_page_number] tail struct pages of a hugepage */ +static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio, + unsigned long start_page_number, + unsigned long end_page_number) +{ + enum zone_type zone = zone_idx(folio_zone(folio)); + int nid = folio_nid(folio); + unsigned long head_pfn = folio_pfn(folio); + unsigned long pfn, end_pfn = head_pfn + end_page_number; + int ret; + + for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) { + struct page *page = pfn_to_page(pfn); + + __init_single_page(page, pfn, zone, nid); + prep_compound_tail((struct page *)folio, pfn - head_pfn); + ret = page_ref_freeze(page, 1); + VM_BUG_ON(!ret); + } +} + +static void __init hugetlb_folio_init_vmemmap(struct folio *folio, + struct hstate *h, + unsigned long nr_pages) +{ + int ret; + + /* Prepare folio head */ + __folio_clear_reserved(folio); + __folio_set_head(folio); + ret = page_ref_freeze(&folio->page, 1); + VM_BUG_ON(!ret); + /* Initialize the necessary tail struct pages */ + hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); + prep_compound_head((struct page *)folio, huge_page_order(h)); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3186,19 +3233,21 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); - struct folio *folio = page_folio(page); + struct folio *folio = (void *)page; struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); - if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { - WARN_ON(folio_test_reserved(folio)); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_folio(folio); /* add to the hugepage allocator */ - } else { - /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_folio(folio, huge_page_order(h)); - } + + hugetlb_folio_init_vmemmap(folio, h, + HUGETLB_VMEMMAP_RESERVE_PAGES); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + /* If HVO fails, initialize all tail struct pages */ + if (!HPageVmemmapOptimized(&folio->page)) + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + free_huge_folio(folio); /* add to the hugepage allocator */ /* * We need to restore the 'stolen' pages to totalram_pages @@ -3209,6 +3258,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } + static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ad650c7b07ec..76682d1d79a7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -588,7 +588,7 @@ static int __init hugetlb_vmemmap_init(void) const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ - BUILD_BUG_ON(__NR_USED_SUBPAGE * sizeof(struct page) > HUGETLB_VMEMMAP_RESERVE_SIZE); + BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 25bd0e002431..4573899855d7 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -10,15 +10,16 @@ #define _LINUX_HUGETLB_VMEMMAP_H #include -#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); - /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See * Documentation/vm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE +#define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { diff --git a/mm/internal.h b/mm/internal.h index a273f4d948d8..f7c963dfbdb3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1155,6 +1155,9 @@ struct vma_prepare { struct vm_area_struct *remove2; }; +void __meminit __init_single_page(struct page *page, unsigned long pfn, + unsigned long zone, int nid); + /* shrinker related functions */ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); diff --git a/mm/mm_init.c b/mm/mm_init.c index 6be6f50813b1..077bfe393b5e 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -555,7 +555,7 @@ out: node_states[N_MEMORY] = saved_node_state; } -static void __meminit __init_single_page(struct page *page, unsigned long pfn, +void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { mm_zero_struct_page(page); -- cgit v1.2.3 From 3ec145f9d01e799bc7e41277e571141546b850f3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:23 +0100 Subject: hugetlb: use a folio in free_hpage_workfn() Patch series "Small hugetlb cleanups", v2. Some trivial folio conversions This patch (of 3): update_and_free_hugetlb_folio puts the memory on hpage_freelist as a folio so we can take it off the list as a folio. Link: https://lkml.kernel.org/r/20230824141325.2704553-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230824141325.2704553-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4e276466d6aa..7a6fe4f13777 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,22 +1787,22 @@ static void free_hpage_workfn(struct work_struct *work) node = llist_del_all(&hpage_freelist); while (node) { - struct page *page; + struct folio *folio; struct hstate *h; - page = container_of((struct address_space **)node, - struct page, mapping); + folio = container_of((struct address_space **)node, + struct folio, mapping); node = node->next; - page->mapping = NULL; + folio->mapping = NULL; /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will clear the hugetlb bit, so do * not use folio_hstate() directly. */ - h = size_to_hstate(page_size(page)); + h = size_to_hstate(folio_size(folio)); - __update_and_free_hugetlb_folio(h, page_folio(page)); + __update_and_free_hugetlb_folio(h, folio); cond_resched(); } -- cgit v1.2.3 From 04bbfd844b99c7c89a344236d3758e7a2d41572f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:24 +0100 Subject: hugetlb: remove a few calls to page_folio() Anything found on a linked list threaded through ->lru is guaranteed to be a folio as the compound_head found in a tail page overlaps the ->lru member of struct page. So we can pull folios directly off these lists no matter whether pages or folios were added to the list. Link: https://lkml.kernel.org/r/20230824141325.2704553-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a6fe4f13777..e91ce966a2c9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1836,11 +1836,9 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { - struct page *page, *t_page; - struct folio *folio; + struct folio *folio, *t_folio; - list_for_each_entry_safe(page, t_page, list, lru) { - folio = page_folio(page); + list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } @@ -2229,8 +2227,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, bool acct_surplus) { int nr_nodes, node; - struct page *page = NULL; - struct folio *folio; + struct folio *folio = NULL; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2240,15 +2237,14 @@ static struct page *remove_pool_huge_page(struct hstate *h, */ if ((!acct_surplus || h->surplus_huge_pages_node[node]) && !list_empty(&h->hugepage_freelists[node])) { - page = list_entry(h->hugepage_freelists[node].next, - struct page, lru); - folio = page_folio(page); + folio = list_entry(h->hugepage_freelists[node].next, + struct folio, lru); remove_hugetlb_folio(h, folio, acct_surplus); break; } } - return page; + return &folio->page; } /* @@ -3414,15 +3410,15 @@ static void try_to_free_low(struct hstate *h, unsigned long count, * Collect pages to be freed on a list, and free after dropping lock */ for_each_node_mask(i, *nodes_allowed) { - struct page *page, *next; + struct folio *folio, *next; struct list_head *freel = &h->hugepage_freelists[i]; - list_for_each_entry_safe(page, next, freel, lru) { + list_for_each_entry_safe(folio, next, freel, lru) { if (count >= h->nr_huge_pages) goto out; - if (PageHighMem(page)) + if (folio_test_highmem(folio)) continue; - remove_hugetlb_folio(h, page_folio(page), false); - list_add(&page->lru, &page_list); + remove_hugetlb_folio(h, folio, false); + list_add(&folio->lru, &page_list); } } -- cgit v1.2.3 From d5b43e9683ec5c78524f9ae93c1824edcbc1f08d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 24 Aug 2023 15:13:25 +0100 Subject: hugetlb: convert remove_pool_huge_page() to remove_pool_hugetlb_folio() Convert the callers to expect a folio and remove the unnecesary conversion back to a struct page. Link: https://lkml.kernel.org/r/20230824141325.2704553-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91ce966a2c9..de220e3ff8be 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1446,7 +1446,7 @@ static int hstate_next_node_to_alloc(struct hstate *h, } /* - * helper for remove_pool_huge_page() - return the previously saved + * helper for remove_pool_hugetlb_folio() - return the previously saved * node ["this node"] from which to free a huge page. Advance the * next node id whether or not we find a free huge page to free so * that the next attempt to free addresses the next node. @@ -2222,9 +2222,8 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * an additional call to free the page to low level allocators. * Called with hugetlb_lock locked. */ -static struct page *remove_pool_huge_page(struct hstate *h, - nodemask_t *nodes_allowed, - bool acct_surplus) +static struct folio *remove_pool_hugetlb_folio(struct hstate *h, + nodemask_t *nodes_allowed, bool acct_surplus) { int nr_nodes, node; struct folio *folio = NULL; @@ -2244,7 +2243,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, } } - return &folio->page; + return folio; } /* @@ -2598,7 +2597,6 @@ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { unsigned long nr_pages; - struct page *page; LIST_HEAD(page_list); lockdep_assert_held(&hugetlb_lock); @@ -2619,15 +2617,17 @@ static void return_unused_surplus_pages(struct hstate *h, * evenly across all nodes with memory. Iterate across these nodes * until we can no longer free unreserved surplus pages. This occurs * when the nodes with surplus pages have no free pages. - * remove_pool_huge_page() will balance the freed pages across the + * remove_pool_hugetlb_folio() will balance the freed pages across the * on-line nodes with memory and will handle the hstate accounting. */ while (nr_pages--) { - page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1); + if (!folio) goto out; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } out: @@ -3472,7 +3472,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; - struct page *page; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3594,11 +3593,13 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - page = remove_pool_huge_page(h, nodes_allowed, 0); - if (!page) + struct folio *folio; + + folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); + if (!folio) break; - list_add(&page->lru, &page_list); + list_add(&folio->lru, &page_list); } /* free the pages after dropping lock */ spin_unlock_irq(&hugetlb_lock); -- cgit v1.2.3 From affa87c708185cab194099ee51b946ef0297f063 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:37 +0000 Subject: mm/damon/core: make DAMOS uses nr_accesses_bp instead of nr_accesses Patch series "mm/damon: implement DAMOS apply intervals". DAMON-based operation schemes are applied for every aggregation interval. That is mainly because schemes are using nr_accesses, which be complete to be used for every aggregation interval. This makes some DAMOS use cases be tricky. Quota setting under long aggregation interval is one such example. Suppose the aggregation interval is ten seconds, and there is a scheme having CPU quota 100ms per 1s. The scheme will actually uses 100ms per ten seconds, since it cannobe be applied before next aggregation interval. The feature is working as intended, but the results might not that intuitive for some users. This could be fixed by updating the quota to 1s per 10s. But, in the case, the CPU usage of DAMOS could look like spikes, and actually make a bad effect to other CPU-sensitive workloads. Also, with such huge aggregation interval, users may want schemes to be applied more frequently. DAMON provides nr_accesses_bp, which is updated for each sampling interval in a way that reasonable to be used. By using that instead of nr_accesses, DAMOS can have its own time interval and mitigate abovely mentioned issues. This patchset makes DAMOS schemes to use nr_accesses_bp instead of nr_accesses, and have their own timing intervals. Also update DAMOS tried regions sysfs files and DAMOS before_apply tracepoint to use the new data as their source. Note that the interval is zero by default, and it is interpreted to use the aggregation interval instead. This avoids making user-visible behavioral changes. Patches Seuqeunce ----------------- The first patch (patch 1/9) makes DAMOS uses nr_accesses_bp instead of nr_accesses, and following two patches (patches 2/9 and 3/9) updates DAMON sysfs interface for DAMOS tried regions and the DAMOS before_apply tracespoint to use nr_accesses_bp instead of nr_accesses, respectively. The following two patches (patches 4/9 and 5/9) implements the scheme-specific apply interval for DAMON kernel API users and update the design document for the new feature. Finally, the following four patches (patches 6/9, 7/9, 8/9 and 9/9) add support of the feature in DAMON sysfs interface, add a simple selftest test case, and document the new file on the usage and the ABI documents, repsectively. This patch (of 9): DAMON provides nr_accesses_bp, which becomes same to nr_accesses * 10000 for every aggregation interval, but updated every sampling interval with a reasonable accuracy. Since DAMON-based operation schemes are applied in every aggregation interval using nr_accesses, using nr_accesses_bp instead will make no difference to users. Meanwhile, it allows DAMOS to apply the schemes in a time interval that less than the aggregation interval. It could be useful and more flexible for some cases. Do it. Link: https://lkml.kernel.org/r/20230916020945.47296-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230916020945.47296-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index b15cf47d2d29..79fef5145a4b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -770,12 +770,13 @@ static void damon_split_region_at(struct damon_target *t, static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; + unsigned int nr_accesses = r->nr_accesses_bp / 10000; sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && - s->pattern.min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_nr_accesses <= nr_accesses && + nr_accesses <= s->pattern.max_nr_accesses && s->pattern.min_age_region <= r->age && r->age <= s->pattern.max_age_region; } -- cgit v1.2.3 From e7639bb48d39dd7f544b19c8a09bb62f934b6e32 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:38 +0000 Subject: mm/damon/sysfs-schemes: use nr_accesses_bp as the source of tried_regions//nr_accesses DAMON sysfs interface exposes access rate of each region via DAMOS tried regions directory. For this, the nr_accesses field of the region is used. DAMOS was actually using nr_accesses in the past, but it uses nr_accesses_bp now. Use the value that it is really using as the source. Note that this doesn't expose nr_accesses_bp as is (in basis point), but after converting it to the natural number by dividing the value by 10,000. Hence there is no behavioral change from users' perspective. Link: https://lkml.kernel.org/r/20230916020945.47296-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 527e7d17eb3b..093700f50b18 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -31,7 +31,7 @@ static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( return NULL; sysfs_region->kobj = (struct kobject){}; sysfs_region->ar = region->ar; - sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->nr_accesses = region->nr_accesses_bp / 10000; sysfs_region->age = region->age; INIT_LIST_HEAD(&sysfs_region->list); return sysfs_region; -- cgit v1.2.3 From a72217ad596ec8a957e1f73e45817c3664d99f1e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:39 +0000 Subject: mm/damon/core: use nr_accesses_bp as a source of damos_before_apply tracepoint damos_before_apply tracepoint is exposing access rate of DAMON regions using nr_accesses field of regions, which was actually used by DAMOS in the past. However, it has changed to use nr_accesses_bp instead. Update the tracepoint to expose the value that DAMOS is really using. Note that it doesn't expose the value as is in the basis point, but after converting it to the natural number by dividing it by 10,000. Therefore this change doesn't make user-visible behavioral differences. Link: https://lkml.kernel.org/r/20230916020945.47296-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/damon.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 19930bb7af9a..23200aabccac 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -36,7 +36,7 @@ TRACE_EVENT_CONDITION(damos_before_apply, __entry->target_idx = target_idx; __entry->start = r->ar.start; __entry->end = r->ar.end; - __entry->nr_accesses = r->nr_accesses; + __entry->nr_accesses = r->nr_accesses_bp / 10000; __entry->age = r->age; __entry->nr_regions = nr_regions; ), -- cgit v1.2.3 From 42f994b71404b17abcd6b170de7a6aa95ffe5d4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:40 +0000 Subject: mm/damon/core: implement scheme-specific apply interval DAMON-based operation schemes are applied for every aggregation interval. That was mainly because schemes were using nr_accesses, which be complete to be used for every aggregation interval. However, the schemes are now using nr_accesses_bp, which is updated for each sampling interval in a way that reasonable to be used. Therefore, there is no reason to apply schemes for each aggregation interval. The unnecessary alignment with aggregation interval was also making some use cases of DAMOS tricky. Quotas setting under long aggregation interval is one such example. Suppose the aggregation interval is ten seconds, and there is a scheme having CPU quota 100ms per 1s. The scheme will actually uses 100ms per ten seconds, since it cannobe be applied before next aggregation interval. The feature is working as intended, but the results might not that intuitive for some users. This could be fixed by updating the quota to 1s per 10s. But, in the case, the CPU usage of DAMOS could look like spikes, and would actually make a bad effect to other CPU-sensitive workloads. Implement a dedicated timing interval for each DAMON-based operation scheme, namely apply_interval. The interval will be sampling interval aligned, and each scheme will be applied for its apply_interval. The interval is set to 0 by default, and it means the scheme should use the aggregation interval instead. This avoids old users getting any behavioral difference. Link: https://lkml.kernel.org/r/20230916020945.47296-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/linux/damon.h | 17 ++++++++++-- mm/damon/core.c | 72 +++++++++++++++++++++++++++++++++++++++++++----- mm/damon/dbgfs.c | 3 +- mm/damon/lru_sort.c | 2 ++ mm/damon/reclaim.c | 2 ++ mm/damon/sysfs-schemes.c | 2 +- 6 files changed, 87 insertions(+), 11 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 491fdd3e4c76..27b995c22497 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -314,16 +314,19 @@ struct damos_access_pattern { * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. + * @apply_interval_us: The time between applying the @action. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * - * For each aggregation interval, DAMON finds regions which fit in the + * For each @apply_interval_us, DAMON finds regions which fit in the * &pattern and applies &action to those. To avoid consuming too much * CPU time or IO resources for the &action, "a is used. * + * If @apply_interval_us is zero, &damon_attrs->aggr_interval is used instead. + * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the * monitoring context are inactive, DAMON stops monitoring either, and just @@ -340,6 +343,14 @@ struct damos_access_pattern { struct damos { struct damos_access_pattern pattern; enum damos_action action; + unsigned long apply_interval_us; +/* private: internal use only */ + /* + * number of sample intervals that should be passed before applying + * @action + */ + unsigned long next_apply_sis; +/* public: */ struct damos_quota quota; struct damos_watermarks wmarks; struct list_head filters; @@ -641,7 +652,9 @@ void damos_add_filter(struct damos *s, struct damos_filter *f); void damos_destroy_filter(struct damos_filter *f); struct damos *damon_new_scheme(struct damos_access_pattern *pattern, - enum damos_action action, struct damos_quota *quota, + enum damos_action action, + unsigned long apply_interval_us, + struct damos_quota *quota, struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 79fef5145a4b..5eb649bd002f 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -313,7 +313,9 @@ static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) } struct damos *damon_new_scheme(struct damos_access_pattern *pattern, - enum damos_action action, struct damos_quota *quota, + enum damos_action action, + unsigned long apply_interval_us, + struct damos_quota *quota, struct damos_watermarks *wmarks) { struct damos *scheme; @@ -323,6 +325,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return NULL; scheme->pattern = *pattern; scheme->action = action; + scheme->apply_interval_us = apply_interval_us; + /* + * next_apply_sis will be set when kdamond starts. While kdamond is + * running, it will also updated when it is added to the DAMON context, + * or damon_attrs are updated. + */ + scheme->next_apply_sis = 0; INIT_LIST_HEAD(&scheme->filters); scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -335,9 +344,21 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, return scheme; } +static void damos_set_next_apply_sis(struct damos *s, struct damon_ctx *ctx) +{ + unsigned long sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + unsigned long apply_interval = s->apply_interval_us ? + s->apply_interval_us : ctx->attrs.aggr_interval; + + s->next_apply_sis = ctx->passed_sample_intervals + + apply_interval / sample_interval; +} + void damon_add_scheme(struct damon_ctx *ctx, struct damos *s) { list_add_tail(&s->list, &ctx->schemes); + damos_set_next_apply_sis(s, ctx); } static void damon_del_scheme(struct damos *s) @@ -558,6 +579,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { unsigned long sample_interval = attrs->sample_interval ? attrs->sample_interval : 1; + struct damos *s; if (attrs->min_nr_regions < 3) return -EINVAL; @@ -573,6 +595,10 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) damon_update_monitoring_results(ctx, attrs); ctx->attrs = *attrs; + + damon_for_each_scheme(s, ctx) + damos_set_next_apply_sis(s, ctx); + return 0; } @@ -1094,14 +1120,29 @@ static void kdamond_apply_schemes(struct damon_ctx *c) struct damon_target *t; struct damon_region *r, *next_r; struct damos *s; + unsigned long sample_interval = c->attrs.sample_interval ? + c->attrs.sample_interval : 1; + bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { + if (c->passed_sample_intervals != s->next_apply_sis) + continue; + + s->next_apply_sis += + (s->apply_interval_us ? s->apply_interval_us : + c->attrs.aggr_interval) / sample_interval; + if (!s->wmarks.activated) continue; + has_schemes_to_apply = true; + damos_adjust_quota(c, s); } + if (!has_schemes_to_apply) + return; + damon_for_each_target(t, c) { damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); @@ -1372,11 +1413,19 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx) { unsigned long sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; + unsigned long apply_interval; + struct damos *scheme; ctx->passed_sample_intervals = 0; ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / sample_interval; + + damon_for_each_scheme(scheme, ctx) { + apply_interval = scheme->apply_interval_us ? + scheme->apply_interval_us : ctx->attrs.aggr_interval; + scheme->next_apply_sis = apply_interval / sample_interval; + } } /* @@ -1428,19 +1477,28 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - sample_interval = ctx->attrs.sample_interval ? - ctx->attrs.sample_interval : 1; if (ctx->passed_sample_intervals == next_aggregation_sis) { - ctx->next_aggregation_sis = next_aggregation_sis + - ctx->attrs.aggr_interval / sample_interval; kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && ctx->callback.after_aggregation(ctx)) break; - if (!list_empty(&ctx->schemes)) - kdamond_apply_schemes(ctx); + } + + /* + * do kdamond_apply_schemes() after kdamond_merge_regions() if + * possible, to reduce overhead + */ + if (!list_empty(&ctx->schemes)) + kdamond_apply_schemes(ctx); + + sample_interval = ctx->attrs.sample_interval ? + ctx->attrs.sample_interval : 1; + if (ctx->passed_sample_intervals == next_aggregation_sis) { + ctx->next_aggregation_sis = next_aggregation_sis + + ctx->attrs.aggr_interval / sample_interval; + kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); if (ctx->ops.reset_aggregated) diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 124f0f8c97b7..dc0ea1fc30ca 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -278,7 +278,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(&pattern, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, 0, "a, + &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 7b8fce2f67a8..3ecdcc029443 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -158,6 +158,8 @@ static struct damos *damon_lru_sort_new_scheme( pattern, /* (de)prioritize on LRU-lists */ action, + /* for each aggregation interval */ + 0, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 648d2a85523a..ab974e477d2f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -142,6 +142,8 @@ static struct damos *damon_reclaim_new_scheme(void) &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, + /* for each aggregation interval */ + 0, /* under the quota. */ &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 093700f50b18..3d30e85596b0 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1610,7 +1610,7 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, "a, + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, 0, "a, &wmarks); if (!scheme) return NULL; -- cgit v1.2.3 From 3f8723f12990d38ed800570fb77b92cf4eff54c8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:41 +0000 Subject: Docs/mm/damon/design: document DAMOS apply intervals Update DAMON design doc to explain about DAMOS apply intervals. Link: https://lkml.kernel.org/r/20230916020945.47296-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 18e9b42673f8..1f7e0586b5fa 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -259,7 +259,8 @@ works, DAMON provides a feature called Data Access Monitoring-based Operation Schemes (DAMOS). It lets users specify their desired schemes at a high level. For such specifications, DAMON starts monitoring, finds regions having the access pattern of interest, and applies the user-desired operation actions -to the regions as soon as found. +to the regions, for every user-specified time interval called +``apply_interval``. .. _damon_design_damos_action: -- cgit v1.2.3 From a2a9f68e358fa9627aa72e7182ad0b82846bda9e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:42 +0000 Subject: mm/damon/sysfs-schemes: support DAMOS apply interval Update DAMON sysfs interface to support DAMOS apply intervals by adding a new file, 'apply_interval_us' in each scheme directory. Users can set and get the interval for each scheme in microseconds by writing to and reading from the file. Link: https://lkml.kernel.org/r/20230916020945.47296-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 3d30e85596b0..a7d70b95c4dd 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1121,6 +1121,7 @@ struct damon_sysfs_scheme { struct kobject kobj; enum damos_action action; struct damon_sysfs_access_pattern *access_pattern; + unsigned long apply_interval_us; struct damon_sysfs_quotas *quotas; struct damon_sysfs_watermarks *watermarks; struct damon_sysfs_scheme_filters *filters; @@ -1141,7 +1142,7 @@ static const char * const damon_sysfs_damos_action_strs[] = { }; static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) + enum damos_action action, unsigned long apply_interval_us) { struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); @@ -1150,6 +1151,7 @@ static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( return NULL; scheme->kobj = (struct kobject){}; scheme->action = action; + scheme->apply_interval_us = apply_interval_us; return scheme; } @@ -1353,6 +1355,25 @@ static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, return -EINVAL; } +static ssize_t apply_interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%lu\n", scheme->apply_interval_us); +} + +static ssize_t apply_interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + int err = kstrtoul(buf, 0, &scheme->apply_interval_us); + + return err ? err : count; +} + static void damon_sysfs_scheme_release(struct kobject *kobj) { kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); @@ -1361,8 +1382,12 @@ static void damon_sysfs_scheme_release(struct kobject *kobj) static struct kobj_attribute damon_sysfs_scheme_action_attr = __ATTR_RW_MODE(action, 0600); +static struct kobj_attribute damon_sysfs_scheme_apply_interval_us_attr = + __ATTR_RW_MODE(apply_interval_us, 0600); + static struct attribute *damon_sysfs_scheme_attrs[] = { &damon_sysfs_scheme_action_attr.attr, + &damon_sysfs_scheme_apply_interval_us_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme); @@ -1413,7 +1438,11 @@ static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, schemes->schemes_arr = schemes_arr; for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + /* + * apply_interval_us as 0 means same to aggregation interval + * (same to before-apply_interval behavior) + */ + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT, 0); if (!scheme) { damon_sysfs_schemes_rm_dirs(schemes); return -ENOMEM; @@ -1610,8 +1639,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - scheme = damon_new_scheme(&pattern, sysfs_scheme->action, 0, "a, - &wmarks); + scheme = damon_new_scheme(&pattern, sysfs_scheme->action, + sysfs_scheme->apply_interval_us, "a, &wmarks); if (!scheme) return NULL; @@ -1641,6 +1670,7 @@ static void damon_sysfs_update_scheme(struct damos *scheme, scheme->pattern.max_age_region = access_pattern->age->max; scheme->action = sysfs_scheme->action; + scheme->apply_interval_us = sysfs_scheme->apply_interval_us; scheme->quota.ms = sysfs_quotas->ms; scheme->quota.sz = sysfs_quotas->sz; -- cgit v1.2.3 From 65ded14e2818bdd9b9bdc128cf47122533f46778 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:43 +0000 Subject: selftests/damon/sysfs: test DAMOS apply intervals Update DAMON selftests to test existence of the file for reading/writing DAMOS apply interval under each scheme directory. Link: https://lkml.kernel.org/r/20230916020945.47296-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 60a9a305aef0..56f0230a8b92 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -175,6 +175,7 @@ test_scheme() ensure_dir "$scheme_dir" "exist" ensure_file "$scheme_dir/action" "exist" "600" test_access_pattern "$scheme_dir/access_pattern" + ensure_file "$scheme_dir/apply_interval_us" "exist" "600" test_quotas "$scheme_dir/quotas" test_watermarks "$scheme_dir/watermarks" test_filters "$scheme_dir/filters" -- cgit v1.2.3 From 033343d5c5b06431b83fb0c7d1af4aaebe0be75e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:44 +0000 Subject: Docs/admin-guide/mm/damon/usage: update for DAMOS apply intervals Update DAMON usage document's DAMON sysfs interface section for the newly added DAMOS apply intervals support (apply_interval_us file). Link: https://lkml.kernel.org/r/20230916020945.47296-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 6272cd36590a..8507a6e45d86 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -76,7 +76,7 @@ comma (","). :: │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... │ │ │ │ │ schemes/nr_schemes - │ │ │ │ │ │ 0/action + │ │ │ │ │ │ 0/action,apply_interval_us │ │ │ │ │ │ │ access_pattern/ │ │ │ │ │ │ │ │ sz/min,max │ │ │ │ │ │ │ │ nr_accesses/min,max @@ -269,8 +269,8 @@ schemes// ------------ In each scheme directory, five directories (``access_pattern``, ``quotas``, -``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and one file -(``action``) exist. +``watermarks``, ``filters``, ``stats``, and ``tried_regions``) and two files +(``action`` and ``apply_interval``) exist. The ``action`` file is for setting and getting the scheme's :ref:`action `. The keywords that can be written to and read @@ -296,6 +296,9 @@ Note that support of each action depends on the running DAMON operations set - ``stat``: Do nothing but count the statistics. Supported by all operations sets. +The ``apply_interval_us`` file is for setting and getting the scheme's +:ref:`apply_interval ` in microseconds. + schemes//access_pattern/ --------------------------- -- cgit v1.2.3 From d57d36b56d66835411fe11332eeaa37e3f30173b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 16 Sep 2023 02:09:45 +0000 Subject: Docs/ABI/damon: update for DAMOS apply intervals Update DAMON ABI document for the newly added DAMON sysfs file for DAMOS apply intervals (apply_interval_us file). Link: https://lkml.kernel.org/r/20230916020945.47296-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 420b30f09cf0..b35649a46a2f 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -151,6 +151,13 @@ Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the action of the scheme. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//apply_interval_us +Date: Sep 2023 +Contact: SeongJae Park +Description: Writing a value to this file sets the action apply interval of + the scheme in microseconds. Reading this file returns the + value. + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//access_pattern/sz/min Date: Mar 2022 Contact: SeongJae Park -- cgit v1.2.3 From ab428b4c459e62df7dab3b1b783ea03ea06ca895 Mon Sep 17 00:00:00 2001 From: Jianguo Bao Date: Sun, 17 Sep 2023 23:04:01 +0800 Subject: mm/writeback: update filemap_dirty_folio() comment Change to use new address space operation dirty_folio(). Link: https://lkml.kernel.org/r/20230917-trycontrib1-v1-1-db22630b8839@gmail.com Fixes: 6f31a5a261db ("fs: Add aops->dirty_folio") Signed-off-by: Jianguo Bau Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b8d3d7040a50..001adbb4a180 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2679,7 +2679,7 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, * @folio: Folio to be marked as dirty. * * Filesystems which do not use buffer heads should call this function - * from their set_page_dirty address space operation. It ignores the + * from their dirty_folio address space operation. It ignores the * contents of folio_get_private(), so if the filesystem marks individual * blocks as dirty, the filesystem should handle that itself. * -- cgit v1.2.3 From f950fa6ec6d2c058c64d364df5a460d1a0f16e96 Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 18 Sep 2023 20:09:50 +0800 Subject: mm/damon/core-test: fix memory leak in damon_new_region() Patch series "mm/damon/core-test: Fix memory leaks in core-test", v3. There are a few memory leaks in core-test which are detected by kmemleak. This patchset fixes the issues. This patch (of 2): When CONFIG_DAMON_KUNIT_TEST=y and making CONFIG_DEBUG_KMEMLEAK=y and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y, the below memory leak is detected. The damon_region which is allocated by kmem_cache_alloc() in damon_new_region() in damon_test_regions() and damon_test_update_monitoring_result() are not freed. So for damon_test_regions(), replace damon_del_region() call with damon_destroy_region() so that it calls both damon_del_region() and damon_free_region(), the latter will free the damon_region. For damon_test_update_monitoring_result(), call damon_free_region() to free it. After applying this patch, the following memory leak is never detected. unreferenced object 0xffff2b49c3edc000 (size 56): comm "kunit_try_catch", pid 338, jiffies 4294895280 (age 557.084s) hex dump (first 32 bytes): 01 00 00 00 00 00 00 00 02 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 49 2b ff ff ............I+.. backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<00000000b528f67c>] kmem_cache_alloc+0x168/0x284 [<000000008603f022>] damon_new_region+0x28/0x54 [<00000000a3b8c64e>] damon_test_regions+0x38/0x270 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 unreferenced object 0xffff2b49c5b20000 (size 56): comm "kunit_try_catch", pid 354, jiffies 4294895304 (age 556.988s) hex dump (first 32 bytes): 03 00 00 00 00 00 00 00 07 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 96 00 00 00 49 2b ff ff ............I+.. backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<00000000b528f67c>] kmem_cache_alloc+0x168/0x284 [<000000008603f022>] damon_new_region+0x28/0x54 [<00000000ca019f80>] damon_test_update_monitoring_result+0x18/0x34 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 Link: https://lkml.kernel.org/r/20230918120951.2230468-1-ruanjinjie@huawei.com Link: https://lkml.kernel.org/r/20230918120951.2230468-2-ruanjinjie@huawei.com Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Fixes: f4c978b6594b ("mm/damon/core-test: add a test for damon_update_monitoring_results()") Signed-off-by: Jinjie Ruan Reviewed-by: SeongJae Park Cc: Brendan Higgins Cc: Feng Tang Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 79f1f12e0dd5..3959be35b901 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -30,7 +30,7 @@ static void damon_test_regions(struct kunit *test) damon_add_region(r, t); KUNIT_EXPECT_EQ(test, 1u, damon_nr_regions(t)); - damon_del_region(r, t); + damon_destroy_region(r, t); KUNIT_EXPECT_EQ(test, 0u, damon_nr_regions(t)); damon_free_target(t); @@ -321,6 +321,8 @@ static void damon_test_update_monitoring_result(struct kunit *test) damon_update_monitoring_result(r, &old_attrs, &new_attrs); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 20); + + damon_free_region(r); } static void damon_test_set_attrs(struct kunit *test) -- cgit v1.2.3 From a0ce79253a96ee6bbcce90775ed342ef3153c68e Mon Sep 17 00:00:00 2001 From: Jinjie Ruan Date: Mon, 18 Sep 2023 20:09:51 +0800 Subject: mm/damon/core-test: fix memory leak in damon_new_ctx() When CONFIG_DAMON_KUNIT_TEST=y and making CONFIG_DEBUG_KMEMLEAK=y and CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y, the below memory leak is detected. The damon_ctx which is allocated by kzalloc() in damon_new_ctx() in damon_test_ops_registration() and damon_test_set_attrs() are not freed. So use damon_destroy_ctx() to free it. After applying this patch, the following memory leak is never detected unreferenced object 0xffff2b49c6968800 (size 512): comm "kunit_try_catch", pid 350, jiffies 4294895294 (age 557.028s) hex dump (first 32 bytes): 88 13 00 00 00 00 00 00 a0 86 01 00 00 00 00 00 ................ 00 87 93 03 00 00 00 00 0a 00 00 00 00 00 00 00 ................ backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<0000000073acab3b>] __kmem_cache_alloc_node+0x174/0x290 [<00000000b5f89cef>] kmalloc_trace+0x40/0x164 [<00000000eb19e83f>] damon_new_ctx+0x28/0xb4 [<00000000daf6227b>] damon_test_ops_registration+0x34/0x328 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 unreferenced object 0xffff2b49c1a9cc00 (size 512): comm "kunit_try_catch", pid 356, jiffies 4294895306 (age 557.000s) hex dump (first 32 bytes): 88 13 00 00 00 00 00 00 a0 86 01 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 0a 00 00 00 00 00 00 00 ................ backtrace: [<0000000088e71769>] slab_post_alloc_hook+0xb8/0x368 [<0000000073acab3b>] __kmem_cache_alloc_node+0x174/0x290 [<00000000b5f89cef>] kmalloc_trace+0x40/0x164 [<00000000eb19e83f>] damon_new_ctx+0x28/0xb4 [<00000000058495c4>] damon_test_set_attrs+0x30/0x1a8 [<00000000559c4801>] kunit_try_run_case+0x50/0xac [<000000003932ed49>] kunit_generic_run_threadfn_adapter+0x20/0x2c [<000000003c3e9211>] kthread+0x124/0x130 [<0000000028f85bdd>] ret_from_fork+0x10/0x20 Link: https://lkml.kernel.org/r/20230918120951.2230468-3-ruanjinjie@huawei.com Fixes: d1836a3b2a9a ("mm/damon/core-test: initialise context before test in damon_test_set_attrs()") Fixes: 4f540f5ab4f2 ("mm/damon/core-test: add a kunit test case for ops registration") Signed-off-by: Jinjie Ruan Reviewed-by: Feng Tang Reviewed-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 3959be35b901..649adf91ebc5 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -269,6 +269,8 @@ static void damon_test_ops_registration(struct kunit *test) /* Check double-registration failure again */ KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); + + damon_destroy_ctx(c); } static void damon_test_set_regions(struct kunit *test) @@ -346,6 +348,8 @@ static void damon_test_set_attrs(struct kunit *test) invalid_attrs = valid_attrs; invalid_attrs.aggr_interval = 4999; KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); + + damon_destroy_ctx(c); } static void damon_test_moving_sum(struct kunit *test) -- cgit v1.2.3 From 28e566572aacdc551e24649e57cc9f04ba880cd2 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:16 +0800 Subject: mm: add functions folio_in_range() and folio_within_vma() Patch series "support large folio for mlock", v3. Yu mentioned at [1] about the mlock() can't be applied to large folio. I leant the related code and here is my understanding: - For RLIMIT_MEMLOCK related, there is no problem. Because the RLIMIT_MEMLOCK statistics is not related underneath page. That means underneath page mlock or munlock doesn't impact the RLIMIT_MEMLOCK statistics collection which is always correct. - For keeping the page in RAM, there is no problem either. At least, during try_to_unmap_one(), once detect the VMA has VM_LOCKED bit set in vm_flags, the folio will be kept whatever the folio is mlocked or not. So the function of mlock for large folio works. But it's not optimized because the page reclaim needs scan these large folio and may split them. This series identified the large folio for mlock to four types: - The large folio is in VM_LOCKED range and fully mapped to the range - The large folio is in the VM_LOCKED range but not fully mapped to the range - The large folio cross VM_LOCKED VMA boundary - The large folio cross last level page table boundary For the first type, we mlock large folio so page reclaim will skip it. For the second/third type, we don't mlock large folio. As the pages not mapped to VM_LOACKED range are mapped to none VM_LOCKED range, if system is in memory pressure situation, the large folio can be picked by page reclaim and split. Then the pages not mapped to VM_LOCKED range can be reclaimed. For the fourth type, we don't mlock large folio because locking one page table lock can't prevent the part in another last level page table being unmapped. Thanks to Ryan for pointing this out. To check whether the folio is fully mapped to the range, PTEs needs be checked to see whether the page of folio is associated. Which needs take page table lock and is heavy operation. So far, the only place needs this check is madvise and page reclaim. These functions already have their own PTE iterator. patch1 introduce API to check whether large folio is in VMA range. patch2 make page reclaim/mlock_vma_folio/munlock_vma_folio support large folio mlock/munlock. patch3 make mlock/munlock syscall support large folio. Yu also mentioned a race which can make folio unevictable after munlock during RFC v2 discussion [3]: We decided that race issue didn't block this series based on: - That race issue was not introduced by this series - We had a looks-ok fix for that race issue. Need to wait for mlock_count fixing patch as Yosry Ahmed suggested [4] [1] https://lore.kernel.org/linux-mm/CAOUHufbtNPkdktjt_5qM45GegVO-rCFOMkSh0HQminQ12zsV8Q@mail.gmail.com/ [2] https://lore.kernel.org/linux-mm/20230809061105.3369958-1-fengwei.yin@intel.com/ [3] https://lore.kernel.org/linux-mm/CAOUHufZ6=9P_=CAOQyw0xw-3q707q-1FVV09dBNDC-hpcpj2Pg@mail.gmail.com/ This patch (of 3): folio_in_range() will be used to check whether the folio is mapped to specific VMA and whether the mapping address of folio is in the range. Also a helper function folio_within_vma() to check whether folio is in the range of vma based on folio_in_range(). Link: https://lkml.kernel.org/r/20230918073318.1181104-1-fengwei.yin@intel.com Link: https://lkml.kernel.org/r/20230918073318.1181104-2-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/internal.h | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/mm/internal.h b/mm/internal.h index f7c963dfbdb3..14f358d068ac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -586,6 +586,56 @@ extern long faultin_vma_page_range(struct vm_area_struct *vma, bool write, int *locked); extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, unsigned long bytes); + +/* + * NOTE: This function can't tell whether the folio is "fully mapped" in the + * range. + * "fully mapped" means all the pages of folio is associated with the page + * table of range while this function just check whether the folio range is + * within the range [start, end). Funcation caller nees to do page table + * check if it cares about the page table association. + * + * Typical usage (like mlock or madvise) is: + * Caller knows at least 1 page of folio is associated with page table of VMA + * and the range [start, end) is intersect with the VMA range. Caller wants + * to know whether the folio is fully associated with the range. It calls + * this function to check whether the folio is in the range first. Then checks + * the page table to know whether the folio is fully mapped to the range. + */ +static inline bool +folio_within_range(struct folio *folio, struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + pgoff_t pgoff, addr; + unsigned long vma_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + + VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio); + if (start > end) + return false; + + if (start < vma->vm_start) + start = vma->vm_start; + + if (end > vma->vm_end) + end = vma->vm_end; + + pgoff = folio_pgoff(folio); + + /* if folio start address is not in vma range */ + if (!in_range(pgoff, vma->vm_pgoff, vma_pglen)) + return false; + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + + return !(addr < start || end - addr < folio_size(folio)); +} + +static inline bool +folio_within_vma(struct folio *folio, struct vm_area_struct *vma) +{ + return folio_within_range(folio, vma, vma->vm_start, vma->vm_end); +} + /* * mlock_vma_folio() and munlock_vma_folio(): * should be called with vma's mmap_lock held for read or write, -- cgit v1.2.3 From 1acbc3f936146d1b34987294803ac131bc298ce8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:17 +0800 Subject: mm: handle large folio when large folio in VM_LOCKED VMA range If large folio is in the range of VM_LOCKED VMA, it should be mlocked to avoid being picked by page reclaim. Which may split the large folio and then mlock each pages again. Mlock this kind of large folio to prevent them being picked by page reclaim. For the large folio which cross the boundary of VM_LOCKED VMA or not fully mapped to VM_LOCKED VMA, we'd better not to mlock it. So if the system is under memory pressure, this kind of large folio will be split and the pages ouf of VM_LOCKED VMA can be reclaimed. Ideally, for large folio, we should mlock it when the large folio is fully mapped to VMA and munlock it if any page are unmampped from VMA. But it's not easy to detect whether the large folio is fully mapped to VMA in some cases (like add/remove rmap). So we update mlock_vma_folio() and munlock_vma_folio() to mlock/munlock the folio according to vma->vm_flags. Let caller to decide whether they should call these two functions. For add rmap, only mlock normal 4K folio and postpone large folio handling to page reclaim phase. It is possible to reuse page table iterator to detect whether folio is fully mapped or not during page reclaim phase. For remove rmap, invoke munlock_vma_folio() to munlock folio unconditionly because rmap makes folio not fully mapped to VMA. Link: https://lkml.kernel.org/r/20230918073318.1181104-3-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/internal.h | 23 ++++++++++++--------- mm/rmap.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 21 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 14f358d068ac..9e62c8b952b8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -644,14 +644,10 @@ folio_within_vma(struct folio *folio, struct vm_area_struct *vma) * mlock is usually called at the end of page_add_*_rmap(), munlock at * the end of page_remove_rmap(); but new anon folios are managed by * folio_add_lru_vma() calling mlock_new_folio(). - * - * @compound is used to include pmd mappings of THPs, but filter out - * pte mappings of THPs, which cannot be consistently counted: a pte - * mapping of the THP head cannot be distinguished by the page alone. */ void mlock_folio(struct folio *folio); static inline void mlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { /* * The VM_SPECIAL check here serves two purposes. @@ -661,17 +657,24 @@ static inline void mlock_vma_folio(struct folio *folio, * file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may * still be set while VM_SPECIAL bits are added: so ignore it then. */ - if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED) && - (compound || !folio_test_large(folio))) + if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED)) mlock_folio(folio); } void munlock_folio(struct folio *folio); static inline void munlock_vma_folio(struct folio *folio, - struct vm_area_struct *vma, bool compound) + struct vm_area_struct *vma) { - if (unlikely(vma->vm_flags & VM_LOCKED) && - (compound || !folio_test_large(folio))) + /* + * munlock if the function is called. Ideally, we should only + * do munlock if any page of folio is unmapped from VMA and + * cause folio not fully mapped to VMA. + * + * But it's not easy to confirm that's the situation. So we + * always munlock the folio and page reclaim will correct it + * if it's wrong. + */ + if (unlikely(vma->vm_flags & VM_LOCKED)) munlock_folio(folio); } diff --git a/mm/rmap.c b/mm/rmap.c index d24e2c36372e..c6bb2339e35b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -798,6 +798,7 @@ struct folio_referenced_arg { unsigned long vm_flags; struct mem_cgroup *memcg; }; + /* * arg: folio_referenced_arg will be passed */ @@ -807,17 +808,33 @@ static bool folio_referenced_one(struct folio *folio, struct folio_referenced_arg *pra = arg; DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0); int referenced = 0; + unsigned long start = address, ptes = 0; while (page_vma_mapped_walk(&pvmw)) { address = pvmw.address; - if ((vma->vm_flags & VM_LOCKED) && - (!folio_test_large(folio) || !pvmw.pte)) { - /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, !pvmw.pte); - page_vma_mapped_walk_done(&pvmw); - pra->vm_flags |= VM_LOCKED; - return false; /* To break the loop */ + if (vma->vm_flags & VM_LOCKED) { + if (!folio_test_large(folio) || !pvmw.pte) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + page_vma_mapped_walk_done(&pvmw); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + /* + * For large folio fully mapped to VMA, will + * be handled after the pvmw loop. + * + * For large folio cross VMA boundaries, it's + * expected to be picked by page reclaim. But + * should skip reference of pages which are in + * the range of VM_LOCKED vma. As page reclaim + * should just count the reference of pages out + * the range of VM_LOCKED vma. + */ + ptes++; + pra->mapcount--; + continue; } if (pvmw.pte) { @@ -842,6 +859,23 @@ static bool folio_referenced_one(struct folio *folio, pra->mapcount--; } + if ((vma->vm_flags & VM_LOCKED) && + folio_test_large(folio) && + folio_within_vma(folio, vma)) { + unsigned long s_align, e_align; + + s_align = ALIGN_DOWN(start, PMD_SIZE); + e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE); + + /* folio doesn't cross page table boundary and fully mapped */ + if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) { + /* Restore the mlock which got missed */ + mlock_vma_folio(folio, vma); + pra->vm_flags |= VM_LOCKED; + return false; /* To break the loop */ + } + } + if (referenced) folio_clear_idle(folio); if (folio_test_clear_young(folio)) @@ -1254,7 +1288,14 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, (folio_test_large(folio) && folio_entire_mapcount(folio) > 1)) && PageAnonExclusive(page), folio); - mlock_vma_folio(folio, vma, compound); + /* + * For large folio, only mlock it if it's fully mapped to VMA. It's + * not easy to check whether the large folio is fully mapped to VMA + * here. Only mlock normal 4K folio and leave page reclaim to handle + * large folio. + */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1354,7 +1395,9 @@ void folio_add_file_rmap_range(struct folio *folio, struct page *page, if (nr) __lruvec_stat_mod_folio(folio, NR_FILE_MAPPED, nr); - mlock_vma_folio(folio, vma, compound); + /* See comments in page_add_anon_rmap() */ + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); } /** @@ -1465,7 +1508,7 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma, * it's only reliable while mapped. */ - munlock_vma_folio(folio, vma, compound); + munlock_vma_folio(folio, vma); } /* @@ -1530,7 +1573,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* Restore the mlock which got missed */ - mlock_vma_folio(folio, vma, false); + if (!folio_test_large(folio)) + mlock_vma_folio(folio, vma); page_vma_mapped_walk_done(&pvmw); ret = false; break; -- cgit v1.2.3 From dc68badcede4ec3b4e5cdfcb8f678670220ac2ca Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Mon, 18 Sep 2023 15:33:18 +0800 Subject: mm: mlock: update mlock_pte_range to handle large folio Current kernel only lock base size folio during mlock syscall. Add large folio support with following rules: - Only mlock large folio when it's in VM_LOCKED VMA range and fully mapped to page table. fully mapped folio is required as if folio is not fully mapped to a VM_LOCKED VMA, if system is in memory pressure, page reclaim is allowed to pick up this folio, split it and reclaim the pages which are not in VM_LOCKED VMA. - munlock will apply to the large folio which is in VMA range or cross the VMA boundary. This is required to handle the case that the large folio is mlocked, later the VMA is split in the middle of large folio. Link: https://lkml.kernel.org/r/20230918073318.1181104-4-fengwei.yin@intel.com Signed-off-by: Yin Fengwei Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Ryan Roberts Cc: Yang Shi Cc: Yosry Ahmed Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/mlock.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 64 insertions(+), 2 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 06bdfab83b58..42b6865f8f82 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -305,6 +305,58 @@ void munlock_folio(struct folio *folio) local_unlock(&mlock_fbatch.lock); } +static inline unsigned int folio_mlock_step(struct folio *folio, + pte_t *pte, unsigned long addr, unsigned long end) +{ + unsigned int count, i, nr = folio_nr_pages(folio); + unsigned long pfn = folio_pfn(folio); + pte_t ptent = ptep_get(pte); + + if (!folio_test_large(folio)) + return 1; + + count = pfn + nr - pte_pfn(ptent); + count = min_t(unsigned int, count, (end - addr) >> PAGE_SHIFT); + + for (i = 0; i < count; i++, pte++) { + pte_t entry = ptep_get(pte); + + if (!pte_present(entry)) + break; + if (pte_pfn(entry) - pfn >= nr) + break; + } + + return i; +} + +static inline bool allow_mlock_munlock(struct folio *folio, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, unsigned int step) +{ + /* + * For unlock, allow munlock large folio which is partially + * mapped to VMA. As it's possible that large folio is + * mlocked and VMA is split later. + * + * During memory pressure, such kind of large folio can + * be split. And the pages are not in VM_LOCKed VMA + * can be reclaimed. + */ + if (!(vma->vm_flags & VM_LOCKED)) + return true; + + /* folio not in range [start, end), skip mlock */ + if (!folio_within_range(folio, vma, start, end)) + return false; + + /* folio is not fully mapped, skip mlock */ + if (step != folio_nr_pages(folio)) + return false; + + return true; +} + static int mlock_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -314,6 +366,8 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, pte_t *start_pte, *pte; pte_t ptent; struct folio *folio; + unsigned int step = 1; + unsigned long start = addr; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { @@ -334,6 +388,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, walk->action = ACTION_AGAIN; return 0; } + for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); if (!pte_present(ptent)) @@ -341,12 +396,19 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr, folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; - if (folio_test_large(folio)) - continue; + + step = folio_mlock_step(folio, pte, addr, end); + if (!allow_mlock_munlock(folio, vma, start, end, step)) + goto next_entry; + if (vma->vm_flags & VM_LOCKED) mlock_folio(folio); else munlock_folio(folio); + +next_entry: + pte += step - 1; + addr += (step - 1) << PAGE_SHIFT; } pte_unmap(start_pte); out: -- cgit v1.2.3 From 51a23b1be92046f0cc52384d30cf060700f1a54e Mon Sep 17 00:00:00 2001 From: Li Zhijian Date: Wed, 2 Aug 2023 17:28:56 +0800 Subject: acpi,mm: fix typo sibiling -> sibling First found this typo as reviewing memory tier code. Fix it by sed like: $ sed -i 's/sibiling/sibling/g' $(git grep -l sibiling) so the acpi one will be corrected as well. Link: https://lkml.kernel.org/r/20230802092856.819328-1-lizhijian@cn.fujitsu.com Signed-off-by: Li Zhijian Cc: Aneesh Kumar K.V Cc: Huang, Ying Cc: Len Brown Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- drivers/acpi/acpi_pad.c | 2 +- include/linux/memory-tiers.h | 2 +- mm/memory-tiers.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c index 7a453c5ff303..7f073ca64f0e 100644 --- a/drivers/acpi/acpi_pad.c +++ b/drivers/acpi/acpi_pad.c @@ -100,7 +100,7 @@ static void round_robin_cpu(unsigned int tsk_index) for_each_cpu(cpu, pad_busy_cpus) cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu)); cpumask_andnot(tmp, cpu_online_mask, tmp); - /* avoid HT sibilings if possible */ + /* avoid HT siblings if possible */ if (cpumask_empty(tmp)) cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus); if (cpumask_empty(tmp)) { diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 437441cdf78f..4fa178b50784 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -22,7 +22,7 @@ struct memory_tier; struct memory_dev_type { /* list of memory types that are part of same tier as this type */ - struct list_head tier_sibiling; + struct list_head tier_sibling; /* abstract distance for this specific memory type */ int adistance; /* Nodes of same abstract distance */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 37a4f59d9585..876d8a5e210e 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -115,7 +115,7 @@ static __always_inline nodemask_t get_memtier_nodemask(struct memory_tier *memti nodemask_t nodes = NODE_MASK_NONE; struct memory_dev_type *memtype; - list_for_each_entry(memtype, &memtier->memory_types, tier_sibiling) + list_for_each_entry(memtype, &memtier->memory_types, tier_sibling) nodes_or(nodes, nodes, memtype->nodes); return nodes; @@ -174,7 +174,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty * If the memtype is already part of a memory tier, * just return that. */ - if (!list_empty(&memtype->tier_sibiling)) { + if (!list_empty(&memtype->tier_sibling)) { list_for_each_entry(memtier, &memory_tiers, list) { if (adistance == memtier->adistance_start) return memtier; @@ -218,7 +218,7 @@ static struct memory_tier *find_create_memory_tier(struct memory_dev_type *memty memtier = new_memtier; link_memtype: - list_add(&memtype->tier_sibiling, &memtier->memory_types); + list_add(&memtype->tier_sibling, &memtier->memory_types); return memtier; } @@ -527,7 +527,7 @@ static bool clear_node_memory_tier(int node) memtype = node_memory_types[node].memtype; node_clear(node, memtype->nodes); if (nodes_empty(memtype->nodes)) { - list_del_init(&memtype->tier_sibiling); + list_del_init(&memtype->tier_sibling); if (list_empty(&memtier->memory_types)) destroy_memory_tier(memtier); } @@ -553,7 +553,7 @@ struct memory_dev_type *alloc_memory_type(int adistance) return ERR_PTR(-ENOMEM); memtype->adistance = adistance; - INIT_LIST_HEAD(&memtype->tier_sibiling); + INIT_LIST_HEAD(&memtype->tier_sibling); memtype->nodes = NODE_MASK_NONE; kref_init(&memtype->kref); return memtype; -- cgit v1.2.3 From 55d2a0bd5eadaade850efa9d3a7ffbb0aeb67198 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Mon, 18 Sep 2023 14:31:42 +0800 Subject: mm: add statistics for PUD level pagetable Recently, we found that cross-die access to pagetable pages on ARM64 machines can cause performance fluctuations in our business. Currently, there are no PMU events available to track this situation on our ARM64 machines, so accurate pagetable accounting can help to analyze this issue, but now the PUD level pagetable accounting is missed. So introduce pagetable_pud_ctor/dtor() to help to get accurate PUD pagetable accounting, as well as converting the architectures which use generic PUD pagetable allocation to add corresponding PUD pagetable accounting. Moreover this patch will mark the PUD level pagetable with PG_table flag, which will help to do sanity validation in unpoison_memory(). On my testing machine, I can see more pagetables statistics after the patch with page-types tool: Before patch: flags page-count MB symbolic-flags long-symbolic-flags 0x0000000004000000 27326 106 __________________________g_________________ pgtable After patch: 0x0000000004000000 27541 107 __________________________g_________________ pgtable Link: https://lkml.kernel.org/r/876c71c03a7e69c17722a690e3225a4f7b172fb2.1695017383.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: Mike Rapoport (IBM) Acked-by: Vishal Moola (Oracle) Cc: Andy Lutomirski Cc: Aneesh Kumar K.V Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Huacai Chen Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlb.h | 5 ++++- arch/loongarch/include/asm/pgalloc.h | 1 + arch/mips/include/asm/pgalloc.h | 1 + arch/x86/mm/pgtable.c | 3 +++ include/asm-generic/pgalloc.h | 7 ++++++- include/linux/mm.h | 16 ++++++++++++++++ 6 files changed, 31 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 2c29239d05c3..846c563689a8 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -96,7 +96,10 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, unsigned long addr) { - tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp)); + struct ptdesc *ptdesc = virt_to_ptdesc(pudp); + + pagetable_pud_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #endif diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index 79470f0b4f1d..4e2d6b7ca2ee 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -84,6 +84,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) if (!ptdesc) return NULL; + pagetable_pud_ctor(ptdesc); pud = ptdesc_address(ptdesc); pud_init(pud); diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 40e40a7eb94a..f4440edcd8fe 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -95,6 +95,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) if (!ptdesc) return NULL; + pagetable_pud_ctor(ptdesc); pud = ptdesc_address(ptdesc); pud_init(pud); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 9deadf517f14..0cbc1b8e8e3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -76,6 +76,9 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { + struct ptdesc *ptdesc = virt_to_ptdesc(pud); + + pagetable_pud_dtor(ptdesc); paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); paravirt_tlb_remove_table(tlb, virt_to_page(pud)); } diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index c75d4a753849..879e5f8aa5e9 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -169,6 +169,8 @@ static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) return NULL; + + pagetable_pud_ctor(ptdesc); return ptdesc_address(ptdesc); } @@ -190,8 +192,11 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { + struct ptdesc *ptdesc = virt_to_ptdesc(pud); + BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - pagetable_free(virt_to_ptdesc(pud)); + pagetable_pud_dtor(ptdesc); + pagetable_free(ptdesc); } #ifndef __HAVE_ARCH_PUD_FREE diff --git a/include/linux/mm.h b/include/linux/mm.h index 31dc25d3f6b5..126b54b45442 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3049,6 +3049,22 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) return ptl; } +static inline void pagetable_pud_ctor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); +} + +static inline void pagetable_pud_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + extern void __init pagecache_init(void); extern void free_initmem(void); -- cgit v1.2.3 From 72a14e821cba73f74aca4bc5f768d77dece8bdb2 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 15 Sep 2023 18:58:44 +0800 Subject: memcg: expose swapcache stat for memcg v1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Expose swapcache stat for memcg v1", v2. Since commit b6038942480e ("mm: memcg: add swapcache stat for memcg v2") adds swapcache stat for the cgroup v2, it seems there is no reason to hide it in memcg v1. Conversely, with swapcached it is more accurate to evaluate the available memory for memcg. Link: https://lkml.kernel.org/r/20230915105845.3199656-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20230915105845.3199656-2-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Yosry Ahmed Acked-by: Tejun Heo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Michal Hocko Cc: Michal Koutný Cc: Zefan Li Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 1 + mm/memcontrol.c | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index ff456871bf4b..ca7d9402f6be 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -551,6 +551,7 @@ memory.stat file includes following statistics: event happens each time a page is unaccounted from the cgroup. swap # of bytes of swap usage + swapcached # of bytes of swap cached in memory dirty # of bytes that are waiting to get written back to the disk. writeback # of bytes of file/anon cache that are queued for syncing to disk. diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68313331971c..7b50f214f9d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4068,7 +4068,10 @@ static const unsigned int memcg1_stats[] = { NR_WRITEBACK, WORKINGSET_REFAULT_ANON, WORKINGSET_REFAULT_FILE, +#ifdef CONFIG_SWAP MEMCG_SWAP, + NR_SWAPCACHE, +#endif }; static const char *const memcg1_stat_names[] = { @@ -4083,7 +4086,10 @@ static const char *const memcg1_stat_names[] = { "writeback", "workingset_refault_anon", "workingset_refault_file", +#ifdef CONFIG_SWAP "swap", + "swapcached", +#endif }; /* Universal VM events cgroup1 shows, original sort order */ -- cgit v1.2.3 From 840ea53a8dec3aa5773f7957d4eaafdf925c664a Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 15 Sep 2023 18:58:45 +0800 Subject: memcg: remove unused do_memsw_account in memcg1_stat_format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since commit b25806dcd3d5("mm: memcontrol: deprecate swapaccounting=0 mode") do_memsw_account() is synonymous with !cgroup_subsys_on_dfl(memory_cgrp_subsys), It always equals true in memcg1_stat_format(). Remove the unused code. Link: https://lkml.kernel.org/r/20230915105845.3199656-3-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Michal Koutný Reviewed-by: Yosry Ahmed Acked-by: Tejun heo Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Zefan Li Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7b50f214f9d6..8713bc796c77 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4113,8 +4113,6 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) - continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr * memcg_page_state_unit(memcg1_stats[i])); @@ -4137,15 +4135,12 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) } seq_buf_printf(s, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); - if (do_memsw_account()) - seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", - (u64)memsw * PAGE_SIZE); + seq_buf_printf(s, "hierarchical_memsw_limit %llu\n", + (u64)memsw * PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) - continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * memcg_page_state_unit(memcg1_stats[i])); -- cgit v1.2.3 From 7ced098fcfe596feab3cea4f40128b0119c7bf1a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 18 Sep 2023 22:18:32 +0200 Subject: mm: document mmu_notifier_invalidate_range_start_nonblock() Document what mmu_notifier_invalidate_range_start_nonblock() is for. Also add a __must_check annotation to signal that callers must bail out if a notifier vetoes the operation. Link: https://lkml.kernel.org/r/20230918201832.265108-1-jannh@google.com Signed-off-by: Jann Horn Reviewed-by: Jason Gunthorpe Reviewed-by: Alistair Popple Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 6e3c857606f1..f349e08a9dfe 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -459,7 +459,14 @@ mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) lock_map_release(&__mmu_notifier_invalidate_range_start_map); } -static inline int +/* + * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it + * can return an error if a notifier can't proceed without blocking, in which + * case you're not allowed to modify PTEs in the specified range. + * + * This is mainly intended for OOM handling. + */ +static inline int __must_check mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; -- cgit v1.2.3 From 76a0fb4fd5c940fe4977f2fbdb08167fa16e32f6 Mon Sep 17 00:00:00 2001 From: liwenyu Date: Wed, 20 Sep 2023 17:38:49 +0800 Subject: delayacct: add memory reclaim delay in get_page_from_freelist The current memory reclaim delay statistics only count the direct memory reclaim of the task in do_try_to_free_pages(). In systems with NUMA open, some tasks occasionally experience slower response times, but the total count of reclaim does not increase, using ftrace can show that node_reclaim has occurred. The memory reclaim occurring in get_page_from_freelist() is also due to heavy memory load. To get the impact of tasks in memory reclaim, this patch adds the statistics of the memory reclaim delay statistics for __node_reclaim(). Link: https://lkml.kernel.org/r/181C946095F0252B+7cc60eca-1abf-4502-aad3-ffd8ef89d910@ex.bilibili.com Signed-off-by: Wen Yu Li Cc: Balbir Singh Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3df0e2a59052..55424215f759 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -7326,6 +7326,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in cond_resched(); psi_memstall_enter(&pflags); + delayacct_freepages_start(); fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP @@ -7348,6 +7349,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); psi_memstall_leave(&pflags); + delayacct_freepages_end(); trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); -- cgit v1.2.3 From 29d68b219ff858e11b6bc1477fa7af58db6895c9 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:53 +0200 Subject: kselftest: vm: fix tabs/spaces inconsistency in the mdwe test Patch series "MDWE without inheritance", v4. Joey recently introduced a Memory-Deny-Write-Executable (MDWE) prctl which tags current with a flag that prevents pages that were previously not executable from becoming executable. This tag always gets inherited by children tasks. (it's in MMF_INIT_MASK) At Google, we've been using a somewhat similar downstream patch for a few years now. To make the adoption of this feature easier, we've had it support a mode in which the W^X flag does not propagate to children. For example, this is handy if a C process which wants W^X protection suspects it could start children processes that would use a JIT. I'd like to align our features with the upstream prctl. This series proposes a new NO_INHERIT flag to the MDWE prctl to make this kind of adoption easier. It sets a different flag in current that is not in MMF_INIT_MASK and which does not propagate. As part of looking into MDWE, I also fixed a couple of things in the MDWE test. The background for this was discussed in these threads: v1: https://lore.kernel.org/all/66900d0ad42797a55259061f757beece@ispras.ru/ v2: https://lore.kernel.org/all/d7e3749c-a718-df94-92af-1cb0fecab772@redhat.com/ This patch (of 6): Fix tabs/spaces inconsistency in the mdwe test. Link: https://lkml.kernel.org/r/20230828150858.393570-1-revest@chromium.org Link: https://lkml.kernel.org/r/20230828150858.393570-2-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Acked-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Ryan Roberts Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mdwe_test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index bc91bef5d254..d0954c657feb 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -49,19 +49,19 @@ FIXTURE_VARIANT(mdwe) FIXTURE_VARIANT_ADD(mdwe, stock) { - .enabled = false, + .enabled = false, .forked = false, }; FIXTURE_VARIANT_ADD(mdwe, enabled) { - .enabled = true, + .enabled = true, .forked = false, }; FIXTURE_VARIANT_ADD(mdwe, forked) { - .enabled = true, + .enabled = true, .forked = true, }; -- cgit v1.2.3 From a27e2e2d465e4ed73371974040689ac3e78fe3ee Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:54 +0200 Subject: kselftest: vm: fix mdwe's mmap_FIXED test case I checked with the original author, the mmap_FIXED test case wasn't properly tested and fails. Currently, it maps two consecutive (non overlapping) pages and expects the second mapping to be denied by MDWE but these two pages have nothing to do with each other so MDWE is actually out of the picture here. What the test actually intended to do was to remap a virtual address using MAP_FIXED. However, this operation unmaps the existing mapping and creates a new one so the va is backed by a new page and MDWE is again out of the picture, all remappings should succeed. This patch keeps the test case to make it clear that this situation is expected to work: MDWE shouldn't block a MAP_FIXED replacement. Link: https://lkml.kernel.org/r/20230828150858.393570-3-revest@chromium.org Fixes: 4cf1fe34fd18 ("kselftest: vm: add tests for memory-deny-write-execute") Signed-off-by: Florent Revest Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Tested-by: Ayush Jain Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mdwe_test.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index d0954c657feb..91aa9c3099e7 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -168,13 +168,10 @@ TEST_F(mdwe, mmap_FIXED) self->p = mmap(NULL, self->size, PROT_READ, self->flags, 0, 0); ASSERT_NE(self->p, MAP_FAILED); - p = mmap(self->p + self->size, self->size, PROT_READ | PROT_EXEC, + /* MAP_FIXED unmaps the existing page before mapping which is allowed */ + p = mmap(self->p, self->size, PROT_READ | PROT_EXEC, self->flags | MAP_FIXED, 0, 0); - if (variant->enabled) { - EXPECT_EQ(p, MAP_FAILED); - } else { - EXPECT_EQ(p, self->p); - } + EXPECT_EQ(p, self->p); } TEST_F(mdwe, arm64_BTI) -- cgit v1.2.3 From c93d05a729f9646a38aaf53781b447c99cfd86fb Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:55 +0200 Subject: kselftest: vm: check errnos in mdwe_test Invalid prctls return a negative code and set errno. It's good practice to check that errno is set as expected. Link: https://lkml.kernel.org/r/20230828150858.393570-4-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: Kees Cook Acked-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mdwe_test.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index 91aa9c3099e7..1b84cf8e1bbe 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -23,14 +23,22 @@ TEST(prctl_flags) { EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_SET_MDWE, 0L, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_GET_MDWE, 7L, 0L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_GET_MDWE, 0L, 7L, 0L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 7L, 0L), 0); + EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_GET_MDWE, 0L, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); } FIXTURE(mdwe) -- cgit v1.2.3 From 0da668333fb07805c2836d5d50e26eda915b24a1 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:56 +0200 Subject: mm: make PR_MDWE_REFUSE_EXEC_GAIN an unsigned long Defining a prctl flag as an int is a footgun because on a 64 bit machine and with a variadic implementation of prctl (like in musl and glibc), when used directly as a prctl argument, it can get casted to long with garbage upper bits which would result in unexpected behaviors. This patch changes the constant to an unsigned long to eliminate that possibilities. This does not break UAPI. I think that a stable backport would be "nice to have": to reduce the chances that users build binaries that could end up with garbage bits in their MDWE prctl arguments. We are not aware of anyone having yet encountered this corner case with MDWE prctls but a backport would reduce the likelihood it happens, since this sort of issues has happened with other prctls. But If this is perceived as a backporting burden, I suppose we could also live without a stable backport. Link: https://lkml.kernel.org/r/20230828150858.393570-5-revest@chromium.org Fixes: b507808ebce2 ("mm: implement memory-deny-write-execute as a prctl") Signed-off-by: Florent Revest Suggested-by: Alexey Izbyshev Reviewed-by: David Hildenbrand Reviewed-by: Kees Cook Acked-by: Catalin Marinas Cc: Anshuman Khandual Cc: Ayush Jain Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Cc: Signed-off-by: Andrew Morton --- include/uapi/linux/prctl.h | 2 +- tools/include/uapi/linux/prctl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 3c36aeade991..9a85c69782bd 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -283,7 +283,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 -# define PR_MDWE_REFUSE_EXEC_GAIN 1 +# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) #define PR_GET_MDWE 66 diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 3c36aeade991..9a85c69782bd 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -283,7 +283,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 -# define PR_MDWE_REFUSE_EXEC_GAIN 1 +# define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) #define PR_GET_MDWE 66 -- cgit v1.2.3 From 24e41bf8a6b424c76c5902fb999e9eca61bdf83d Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:57 +0200 Subject: mm: add a NO_INHERIT flag to the PR_SET_MDWE prctl This extends the current PR_SET_MDWE prctl arg with a bit to indicate that the process doesn't want MDWE protection to propagate to children. To implement this no-inherit mode, the tag in current->mm->flags must be absent from MMF_INIT_MASK. This means that the encoding for "MDWE but without inherit" is different in the prctl than in the mm flags. This leads to a bit of bit-mangling in the prctl implementation. Link: https://lkml.kernel.org/r/20230828150858.393570-6-revest@chromium.org Signed-off-by: Florent Revest Reviewed-by: Kees Cook Reviewed-by: Catalin Marinas Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 10 ++++++++++ include/uapi/linux/prctl.h | 1 + kernel/fork.c | 2 +- kernel/sys.c | 32 ++++++++++++++++++++++++++------ tools/include/uapi/linux/prctl.h | 1 + 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 0ee96ea7a0e9..1b37fa8fc723 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -91,4 +91,14 @@ static inline int get_dumpable(struct mm_struct *mm) MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) #define MMF_VM_MERGE_ANY 29 +#define MMF_HAS_MDWE_NO_INHERIT 30 + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 diff --git a/kernel/fork.c b/kernel/fork.c index 1779183a7cb3..e45a4457ba83 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1288,7 +1288,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, hugetlb_count_init(mm); if (current->mm) { - mm->flags = current->mm->flags & MMF_INIT_MASK; + mm->flags = mmf_init_flags(current->mm->flags); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; diff --git a/kernel/sys.c b/kernel/sys.c index 2410e3999ebe..4a8073c1b255 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2368,19 +2368,41 @@ static int prctl_set_vma(unsigned long opt, unsigned long start, } #endif /* CONFIG_ANON_VMA_NAME */ +static inline unsigned long get_current_mdwe(void) +{ + unsigned long ret = 0; + + if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + ret |= PR_MDWE_REFUSE_EXEC_GAIN; + if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags)) + ret |= PR_MDWE_NO_INHERIT; + + return ret; +} + static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3, unsigned long arg4, unsigned long arg5) { + unsigned long current_bits; + if (arg3 || arg4 || arg5) return -EINVAL; - if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN)) + if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) + return -EINVAL; + + /* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ + if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL; + current_bits = get_current_mdwe(); + if (current_bits && current_bits != bits) + return -EPERM; /* Cannot unset the flags */ + + if (bits & PR_MDWE_NO_INHERIT) + set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN) set_bit(MMF_HAS_MDWE, ¤t->mm->flags); - else if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) - return -EPERM; /* Cannot unset the flag */ return 0; } @@ -2390,9 +2412,7 @@ static inline int prctl_get_mdwe(unsigned long arg2, unsigned long arg3, { if (arg2 || arg3 || arg4 || arg5) return -EINVAL; - - return test_bit(MMF_HAS_MDWE, ¤t->mm->flags) ? - PR_MDWE_REFUSE_EXEC_GAIN : 0; + return get_current_mdwe(); } static int prctl_get_auxv(void __user *addr, unsigned long len) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index 9a85c69782bd..370ed14b1ae0 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -284,6 +284,7 @@ struct prctl_mm_map { /* Memory deny write / execute */ #define PR_SET_MDWE 65 # define PR_MDWE_REFUSE_EXEC_GAIN (1UL << 0) +# define PR_MDWE_NO_INHERIT (1UL << 1) #define PR_GET_MDWE 66 -- cgit v1.2.3 From 2dc539ac4d2f556105ea7e2a37d23379a79a47eb Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Mon, 28 Aug 2023 17:08:58 +0200 Subject: kselftest: vm: add tests for no-inherit memory-deny-write-execute Add some tests to cover the new PR_MDWE_NO_INHERIT flag of the PR_SET_MDWE prctl. Check that: - it can't be set without PR_SET_MDWE - MDWE flags can't be unset - when set, PR_SET_MDWE doesn't propagate to children Link: https://lkml.kernel.org/r/20230828150858.393570-7-revest@chromium.org Signed-off-by: Florent Revest Acked-by: Catalin Marinas Reviewed-by: Kees Cook Cc: Alexey Izbyshev Cc: Anshuman Khandual Cc: Ayush Jain Cc: David Hildenbrand Cc: Greg Thelen Cc: Joey Gouly Cc: KP Singh Cc: Mark Brown Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Szabolcs Nagy Cc: Topi Miettinen Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mdwe_test.c | 114 +++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/mdwe_test.c b/tools/testing/selftests/mm/mdwe_test.c index 1b84cf8e1bbe..200bedcdc32e 100644 --- a/tools/testing/selftests/mm/mdwe_test.c +++ b/tools/testing/selftests/mm/mdwe_test.c @@ -22,6 +22,9 @@ TEST(prctl_flags) { + EXPECT_LT(prctl(PR_SET_MDWE, PR_MDWE_NO_INHERIT, 0L, 0L, 7L), 0); + EXPECT_EQ(errno, EINVAL); + EXPECT_LT(prctl(PR_SET_MDWE, 7L, 0L, 0L, 0L), 0); EXPECT_EQ(errno, EINVAL); EXPECT_LT(prctl(PR_SET_MDWE, 0L, 7L, 0L, 0L), 0); @@ -41,6 +44,84 @@ TEST(prctl_flags) EXPECT_EQ(errno, EINVAL); } +FIXTURE(consecutive_prctl_flags) {}; +FIXTURE_SETUP(consecutive_prctl_flags) {} +FIXTURE_TEARDOWN(consecutive_prctl_flags) {} + +FIXTURE_VARIANT(consecutive_prctl_flags) +{ + unsigned long first_flags; + unsigned long second_flags; + bool should_work; +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_no_flags) +{ + .first_flags = 0, + .second_flags = 0, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_exec_gain) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, can_keep_both_flags) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .should_work = true, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = 0, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_mdwe_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = 0, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_disable_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .should_work = false, +}; + +FIXTURE_VARIANT_ADD(consecutive_prctl_flags, cant_enable_no_inherit) +{ + .first_flags = PR_MDWE_REFUSE_EXEC_GAIN, + .second_flags = PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT, + .should_work = false, +}; + +TEST_F(consecutive_prctl_flags, two_prctls) +{ + int ret; + + EXPECT_EQ(prctl(PR_SET_MDWE, variant->first_flags, 0L, 0L, 0L), 0); + + ret = prctl(PR_SET_MDWE, variant->second_flags, 0L, 0L, 0L); + if (variant->should_work) { + EXPECT_EQ(ret, 0); + + ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); + ASSERT_EQ(ret, variant->second_flags); + } else { + EXPECT_NE(ret, 0); + ASSERT_EQ(errno, EPERM); + } +} + FIXTURE(mdwe) { void *p; @@ -53,28 +134,45 @@ FIXTURE_VARIANT(mdwe) { bool enabled; bool forked; + bool inherit; }; FIXTURE_VARIANT_ADD(mdwe, stock) { .enabled = false, .forked = false, + .inherit = false, }; FIXTURE_VARIANT_ADD(mdwe, enabled) { .enabled = true, .forked = false, + .inherit = true, }; -FIXTURE_VARIANT_ADD(mdwe, forked) +FIXTURE_VARIANT_ADD(mdwe, inherited) { .enabled = true, .forked = true, + .inherit = true, }; +FIXTURE_VARIANT_ADD(mdwe, not_inherited) +{ + .enabled = true, + .forked = true, + .inherit = false, +}; + +static bool executable_map_should_fail(const FIXTURE_VARIANT(mdwe) *variant) +{ + return variant->enabled && (!variant->forked || variant->inherit); +} + FIXTURE_SETUP(mdwe) { + unsigned long mdwe_flags; int ret, status; self->p = NULL; @@ -84,13 +182,17 @@ FIXTURE_SETUP(mdwe) if (!variant->enabled) return; - ret = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0L, 0L, 0L); + mdwe_flags = PR_MDWE_REFUSE_EXEC_GAIN; + if (!variant->inherit) + mdwe_flags |= PR_MDWE_NO_INHERIT; + + ret = prctl(PR_SET_MDWE, mdwe_flags, 0L, 0L, 0L); ASSERT_EQ(ret, 0) { TH_LOG("PR_SET_MDWE failed or unsupported"); } ret = prctl(PR_GET_MDWE, 0L, 0L, 0L, 0L); - ASSERT_EQ(ret, 1); + ASSERT_EQ(ret, mdwe_flags); if (variant->forked) { self->pid = fork(); @@ -121,7 +223,7 @@ TEST_F(mdwe, mmap_READ_EXEC) TEST_F(mdwe, mmap_WRITE_EXEC) { self->p = mmap(NULL, self->size, PROT_WRITE | PROT_EXEC, self->flags, 0, 0); - if (variant->enabled) { + if (executable_map_should_fail(variant)) { EXPECT_EQ(self->p, MAP_FAILED); } else { EXPECT_NE(self->p, MAP_FAILED); @@ -147,7 +249,7 @@ TEST_F(mdwe, mprotect_add_EXEC) ASSERT_NE(self->p, MAP_FAILED); ret = mprotect(self->p, self->size, PROT_READ | PROT_EXEC); - if (variant->enabled) { + if (executable_map_should_fail(variant)) { EXPECT_LT(ret, 0); } else { EXPECT_EQ(ret, 0); @@ -162,7 +264,7 @@ TEST_F(mdwe, mprotect_WRITE_EXEC) ASSERT_NE(self->p, MAP_FAILED); ret = mprotect(self->p, self->size, PROT_WRITE | PROT_EXEC); - if (variant->enabled) { + if (executable_map_should_fail(variant)) { EXPECT_LT(ret, 0); } else { EXPECT_EQ(ret, 0); -- cgit v1.2.3 From d98388cef5315d0235fdcad26600102f54966927 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Thu, 21 Sep 2023 16:15:35 +0800 Subject: mm/filemap: increase usage of folio_next_index() helper Simplify code pattern of 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index() in filemap_map_pages(). Link: https://lkml.kernel.org/r/20230921081535.3398-1-duminjie@vivo.com Signed-off-by: Minjie Du Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yin Fengwei Signed-off-by: Andrew Morton --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index f0a15ce1bd1b..9481ffaf24e6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3591,7 +3591,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - end = folio->index + folio_nr_pages(folio) - 1; + end = folio_next_index(folio) - 1; nr_pages = min(end, end_pgoff) - xas.xa_index + 1; if (!folio_test_large(folio)) -- cgit v1.2.3 From 65610453459f9048678a0daef89d592e412ec00a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:12 +0800 Subject: mm: memory: add vm_normal_folio_pmd() Patch series "mm: convert numa balancing functions to use a folio", v2. do_numa_pages() only handles non-compound pages, and only PMD-mapped THPs are handled in do_huge_pmd_numa_page(). But a large, PTE-mapped folio will be supported so let's convert more numa balancing functions to use/take a folio in preparation for that, no functional change intended for now. This patch (of 6): The new vm_normal_folio_pmd() wrapper is similar to vm_normal_folio(), which allow them to completely replace the struct page variables with struct folio variables. Link: https://lkml.kernel.org/r/20230921074417.24004-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230921074417.24004-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ mm/memory.c | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 126b54b45442..52c40b3d0813 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2327,6 +2327,8 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); diff --git a/mm/memory.c b/mm/memory.c index d956b231e835..311e862c6404 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -689,6 +689,16 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, out: return pfn_to_page(pfn); } + +struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page = vm_normal_page_pmd(vma, addr, pmd); + + if (page) + return page_folio(page); + return NULL; +} #endif static void restore_exclusive_pte(struct vm_area_struct *vma, -- cgit v1.2.3 From 667ffc31aa95e7023707924b08415523208bce9d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:13 +0800 Subject: mm: huge_memory: use a folio in do_huge_pmd_numa_page() Use a folio in do_huge_pmd_numa_page(), reduce three page_folio() calls to one, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5c20e43782e4..5baf9b6dc522 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1517,9 +1517,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; pmd_t oldpmd = vmf->orig_pmd; pmd_t pmd; - struct page *page; + struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - int page_nid = NUMA_NO_NODE; + int nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); bool migrated = false, writable = false; int flags = 0; @@ -1541,36 +1541,34 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) can_change_pmd_writable(vma, vmf->address, pmd)) writable = true; - page = vm_normal_page_pmd(vma, haddr, pmd); - if (!page) + folio = vm_normal_folio_pmd(vma, haddr, pmd); + if (!folio) goto out_map; /* See similar comment in do_numa_page for explanation */ if (!writable) flags |= TNF_NO_GROUP; - page_nid = page_to_nid(page); + nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ - if (node_is_toptier(page_nid)) - last_cpupid = page_cpupid_last(page); - target_nid = numa_migrate_prep(page, vma, haddr, page_nid, - &flags); - + if (node_is_toptier(nid)) + last_cpupid = page_cpupid_last(&folio->page); + target_nid = numa_migrate_prep(&folio->page, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { - put_page(page); + folio_put(folio); goto out_map; } spin_unlock(vmf->ptl); writable = false; - migrated = migrate_misplaced_folio(page_folio(page), vma, target_nid); + migrated = migrate_misplaced_folio(folio, vma, target_nid); if (migrated) { flags |= TNF_MIGRATED; - page_nid = target_nid; + nid = target_nid; } else { flags |= TNF_MIGRATE_FAIL; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1582,9 +1580,8 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, - flags); + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); return 0; -- cgit v1.2.3 From 6695cf68b15c215d33b8add64c33e01e3cbe236c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:14 +0800 Subject: mm: memory: use a folio in do_numa_page() Numa balancing only try to migrate non-compound page in do_numa_page(), use a folio in it to save several compound_head calls, note we use folio_estimated_sharers(), it is enough to check the folio sharers since only normal page is handled, if large folio numa balancing is supported, a precise folio sharers check would be used, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 311e862c6404..865741d9b6b9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4744,8 +4744,8 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL; - int page_nid = NUMA_NO_NODE; + struct folio *folio = NULL; + int nid = NUMA_NO_NODE; bool writable = false; int last_cpupid; int target_nid; @@ -4776,12 +4776,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) can_change_pte_writable(vma, vmf->address, pte)) writable = true; - page = vm_normal_page(vma, vmf->address, pte); - if (!page || is_zone_device_page(page)) + folio = vm_normal_folio(vma, vmf->address, pte); + if (!folio || folio_is_zone_device(folio)) goto out_map; /* TODO: handle PTE-mapped THP */ - if (PageCompound(page)) + if (folio_test_large(folio)) goto out_map; /* @@ -4796,34 +4796,34 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) flags |= TNF_NO_GROUP; /* - * Flag if the page is shared between multiple address spaces. This + * Flag if the folio is shared between multiple address spaces. This * is later used when determining whether to group tasks together */ - if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) + if (folio_estimated_sharers(folio) > 1 && (vma->vm_flags & VM_SHARED)) flags |= TNF_SHARED; - page_nid = page_to_nid(page); + nid = folio_nid(folio); /* * For memory tiering mode, cpupid of slow memory page is used * to record page access time. So use default value. */ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && - !node_is_toptier(page_nid)) + !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(page); - target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, - &flags); + last_cpupid = page_cpupid_last(&folio->page); + target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, + &flags); if (target_nid == NUMA_NO_NODE) { - put_page(page); + folio_put(folio); goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); writable = false; /* Migrate to the requested node */ - if (migrate_misplaced_folio(page_folio(page), vma, target_nid)) { - page_nid = target_nid; + if (migrate_misplaced_folio(folio, vma, target_nid)) { + nid = target_nid; flags |= TNF_MIGRATED; } else { flags |= TNF_MIGRATE_FAIL; @@ -4839,8 +4839,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) } out: - if (page_nid != NUMA_NO_NODE) - task_numa_fault(last_cpupid, page_nid, 1, flags); + if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, 1, flags); return 0; out_map: /* -- cgit v1.2.3 From cda6d93672ac5dd8af778a3f3e6082e12233b65b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:15 +0800 Subject: mm: memory: make numa_migrate_prep() to take a folio In preparation for large folio numa balancing, make numa_migrate_prep() to take a folio, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/internal.h | 2 +- mm/memory.c | 9 ++++----- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5baf9b6dc522..aa0224556132 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1556,7 +1556,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) */ if (node_is_toptier(nid)) last_cpupid = page_cpupid_last(&folio->page); - target_nid = numa_migrate_prep(&folio->page, vma, haddr, nid, &flags); + target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; diff --git a/mm/internal.h b/mm/internal.h index 9e62c8b952b8..5a5c923725d3 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -983,7 +983,7 @@ void vunmap_range_noflush(unsigned long start, unsigned long end); void __vunmap_range_noflush(unsigned long start, unsigned long end); -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); void free_zone_device_page(struct page *page); diff --git a/mm/memory.c b/mm/memory.c index 865741d9b6b9..20b290c9dc87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4724,10 +4724,10 @@ static vm_fault_t do_fault(struct vm_fault *vmf) return ret; } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { - get_page(page); + folio_get(folio); /* Record the current PID acceesing VMA */ vma_set_access_pid_bit(vma); @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(page, vma, addr); + return mpol_misplaced(&folio->page, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) @@ -4812,8 +4812,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) last_cpupid = (-1 & LAST_CPUPID_MASK); else last_cpupid = page_cpupid_last(&folio->page); - target_nid = numa_migrate_prep(&folio->page, vma, vmf->address, nid, - &flags); + target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); goto out_map; -- cgit v1.2.3 From 75c70128a67311070115b90d826a229d4bbbb2b5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:16 +0800 Subject: mm: mempolicy: make mpol_misplaced() to take a folio In preparation for large folio numa balancing, make mpol_misplaced() to take a folio, no functional change intended. Link: https://lkml.kernel.org/r/20230921074417.24004-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 5 +++-- mm/memory.c | 2 +- mm/mempolicy.c | 22 ++++++++++++---------- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index d232de7cdc56..6c2754d7bfed 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -174,7 +174,7 @@ extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); -extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +int mpol_misplaced(struct folio *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); static inline bool mpol_is_preferred_many(struct mempolicy *pol) @@ -278,7 +278,8 @@ static inline int mpol_parse_str(char *str, struct mempolicy **mpol) } #endif -static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, +static inline int mpol_misplaced(struct folio *folio, + struct vm_area_struct *vma, unsigned long address) { return -1; /* no node preference */ diff --git a/mm/memory.c b/mm/memory.c index 20b290c9dc87..7ca16ed67634 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4738,7 +4738,7 @@ int numa_migrate_prep(struct folio *folio, struct vm_area_struct *vma, *flags |= TNF_FAULT_LOCAL; } - return mpol_misplaced(&folio->page, vma, addr); + return mpol_misplaced(folio, vma, addr); } static vm_fault_t do_numa_page(struct vm_fault *vmf) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f1b00d6ac7ee..69c0eac7292c 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2564,24 +2564,25 @@ static void sp_free(struct sp_node *n) } /** - * mpol_misplaced - check whether current page node is valid in policy + * mpol_misplaced - check whether current folio node is valid in policy * - * @page: page to be checked - * @vma: vm area where page mapped - * @addr: virtual address where page mapped + * @folio: folio to be checked + * @vma: vm area where folio mapped + * @addr: virtual address in @vma for shared policy lookup and interleave policy * - * Lookup current policy node id for vma,addr and "compare to" page's + * Lookup current policy node id for vma,addr and "compare to" folio's * node id. Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. * * Return: NUMA_NO_NODE if the page is in a node that is valid for this - * policy, or a suitable node ID to allocate a replacement page from. + * policy, or a suitable node ID to allocate a replacement folio from. */ -int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) +int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol; struct zoneref *z; - int curnid = page_to_nid(page); + int curnid = folio_nid(folio); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); @@ -2637,11 +2638,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long BUG(); } - /* Migrate the page towards the node whose CPU is referencing it */ + /* Migrate the folio towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) + if (!should_numa_migrate_memory(current, &folio->page, curnid, + thiscpu)) goto out; } -- cgit v1.2.3 From 8c9ae56dc73b5ae48a14000b96292bd4f2aeb710 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 21 Sep 2023 15:44:17 +0800 Subject: sched/numa, mm: make numa migrate functions to take a folio The cpupid (or access time) is stored in the head page for THP, so it is safely to make should_numa_migrate_memory() and numa_hint_fault_latency() to take a folio. This is in preparation for large folio numa balancing. Link: https://lkml.kernel.org/r/20230921074417.24004-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/sched/numa_balancing.h | 6 +++--- kernel/sched/fair.c | 12 ++++++------ mm/mempolicy.c | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..06a9d35650f0 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -20,8 +20,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); extern void task_numa_free(struct task_struct *p, bool final); -extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, - int src_nid, int dst_cpu); +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, + int src_nid, int dst_cpu); #else static inline void task_numa_fault(int last_node, int node, int pages, int flags) @@ -38,7 +38,7 @@ static inline void task_numa_free(struct task_struct *p, bool final) { } static inline bool should_numa_migrate_memory(struct task_struct *p, - struct page *page, int src_nid, int dst_cpu) + struct folio *folio, int src_nid, int dst_cpu) { return true; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cb225921bbca..42aefe7e6fdc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1722,12 +1722,12 @@ static bool pgdat_free_space_enough(struct pglist_data *pgdat) * The smaller the hint page fault latency, the higher the possibility * for the page to be hot. */ -static int numa_hint_fault_latency(struct page *page) +static int numa_hint_fault_latency(struct folio *folio) { int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(page, time); + last_time = xchg_page_access_time(&folio->page, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } @@ -1784,7 +1784,7 @@ static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, } } -bool should_numa_migrate_memory(struct task_struct *p, struct page * page, +bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { struct numa_group *ng = deref_curr_numa_group(p); @@ -1814,16 +1814,16 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); th = pgdat->nbp_threshold ? : def_th; - latency = numa_hint_fault_latency(page); + latency = numa_hint_fault_latency(folio); if (latency >= th) return false; return !numa_promotion_rate_limit(pgdat, rate_limit, - thp_nr_pages(page)); + folio_nr_pages(folio)); } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 69c0eac7292c..abd94f4c7f6b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2642,7 +2642,7 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, if (pol->flags & MPOL_F_MORON) { polnid = thisnid; - if (!should_numa_migrate_memory(current, &folio->page, curnid, + if (!should_numa_migrate_memory(current, folio, curnid, thiscpu)) goto out; } -- cgit v1.2.3 From 987ffa5a3858bee448dc791cf6f596790aea52a8 Mon Sep 17 00:00:00 2001 From: Huan Yang Date: Wed, 20 Sep 2023 09:57:27 +0800 Subject: mm/damon/core: remove unnecessary si_meminfo invoke. si_meminfo() will read and assign more info not just free/ram pages. For just DAMOS_WMARK_FREE_MEM_RATE use, only get free and ram pages is ok to save cpu. Link: https://lkml.kernel.org/r/20230920015727.4482-1-link@vivo.com Signed-off-by: Huan Yang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 5eb649bd002f..9f4f7c378cf3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1326,12 +1326,10 @@ static bool kdamond_need_stop(struct damon_ctx *ctx) static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric) { - struct sysinfo i; - switch (metric) { case DAMOS_WMARK_FREE_MEM_RATE: - si_meminfo(&i); - return i.freeram * 1000 / i.totalram; + return global_zone_page_state(NR_FREE_PAGES) * 1000 / + totalram_pages(); default: break; } -- cgit v1.2.3 From 3c6f33b7273a7e2f2b2497b62c8400bd957b2fbe Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 22 Sep 2023 14:11:40 -0700 Subject: mm/ksm: support fork/exec for prctl Patch series "mm/ksm: add fork-exec support for prctl", v4. A process can enable KSM with the prctl system call. When the process is forked the KSM flag is inherited by the child process. However if the process is executing an exec system call directly after the fork, the KSM setting is cleared. This patch series addresses this problem. 1) Change the mask in coredump.h for execing a new process 2) Add a new test case in ksm_functional_tests This patch (of 2): Today we have two ways to enable KSM: 1) madvise system call This allows to enable KSM for a memory region for a long time. 2) prctl system call This is a recent addition to enable KSM for the complete process. In addition when a process is forked, the KSM setting is inherited. This change only affects the second case. One of the use cases for (2) was to support the ability to enable KSM for cgroups. This allows systemd to enable KSM for the seed process. By enabling it in the seed process all child processes inherit the setting. This works correctly when the process is forked. However it doesn't support fork/exec workflow. From the previous cover letter: .... Use case 3: With the madvise call sharing opportunities are only enabled for the current process: it is a workload-local decision. A considerable number of sharing opportunities may exist across multiple workloads or jobs (if they are part of the same security domain). Only a higler level entity like a job scheduler or container can know for certain if its running one or more instances of a job. That job scheduler however doesn't have the necessary internal workload knowledge to make targeted madvise calls. .... In addition it can also be a bit surprising that fork keeps the KSM setting and fork/exec does not. Link: https://lkml.kernel.org/r/20230922211141.320789-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230922211141.320789-2-shr@devkernel.io Signed-off-by: Stefan Roesch Fixes: d7597f59d1d3 ("mm: add new api to enable ksm per process") Reviewed-by: David Hildenbrand Reported-by: Carl Klemm Tested-by: Carl Klemm Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/sched/coredump.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 1b37fa8fc723..02f5090ffea2 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* @@ -85,13 +86,15 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_MDWE 28 #define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK) +#define MMF_HAS_MDWE_NO_INHERIT 29 -#define MMF_VM_MERGE_ANY 29 -#define MMF_HAS_MDWE_NO_INHERIT 30 +#define MMF_VM_MERGE_ANY 30 +#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ + MMF_VM_MERGE_ANY_MASK) static inline unsigned long mmf_init_flags(unsigned long flags) { -- cgit v1.2.3 From 0374af1da077573b2bea8ff70258d3537c5a1e79 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 22 Sep 2023 14:11:41 -0700 Subject: mm/ksm: test case for prctl fork/exec workflow This adds a new test case to the ksm functional tests to make sure that the KSM setting is inherited by the child process when doing a fork/exec. Link: https://lkml.kernel.org/r/20230922211141.320789-3-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Carl Klemm Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/ksm_functional_tests.c | 66 ++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 901e950f9138..fbff0dd09191 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -26,6 +26,7 @@ #define KiB 1024u #define MiB (1024 * KiB) +#define FORK_EXEC_CHILD_PRG_NAME "ksm_fork_exec_child" static int mem_fd; static int ksm_fd; @@ -479,6 +480,64 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } +static int ksm_fork_exec_child(void) +{ + /* Test if KSM is enabled for the process. */ + return prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0) == 1; +} + +static void test_prctl_fork_exec(void) +{ + int ret, status; + pid_t child_pid; + + ksft_print_msg("[RUN] %s\n", __func__); + + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); + if (ret < 0 && errno == EINVAL) { + ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); + return; + } else if (ret) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n"); + return; + } + + child_pid = fork(); + if (child_pid == -1) { + ksft_test_result_skip("fork() failed\n"); + return; + } else if (child_pid == 0) { + char *prg_name = "./ksm_functional_tests"; + char *argv_for_program[] = { prg_name, FORK_EXEC_CHILD_PRG_NAME }; + + execv(prg_name, argv_for_program); + return; + } + + if (waitpid(child_pid, &status, 0) > 0) { + if (WIFEXITED(status)) { + status = WEXITSTATUS(status); + if (status) { + ksft_test_result_fail("KSM not enabled\n"); + return; + } + } else { + ksft_test_result_fail("program didn't terminate normally\n"); + return; + } + } else { + ksft_test_result_fail("waitpid() failed\n"); + return; + } + + if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) { + ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n"); + return; + } + + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); +} + static void test_prctl_unmerge(void) { const unsigned int size = 2 * MiB; @@ -536,9 +595,13 @@ unmap: int main(int argc, char **argv) { - unsigned int tests = 7; + unsigned int tests = 8; int err; + if (argc > 1 && !strcmp(argv[1], FORK_EXEC_CHILD_PRG_NAME)) { + exit(ksm_fork_exec_child() == 1 ? 0 : 1); + } + #ifdef __NR_userfaultfd tests++; #endif @@ -576,6 +639,7 @@ int main(int argc, char **argv) test_prctl(); test_prctl_fork(); + test_prctl_fork_exec(); test_prctl_unmerge(); err = ksft_get_fail_cnt(); -- cgit v1.2.3 From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 12:20:17 -0700 Subject: mm/filemap: remove hugetlb special casing in filemap.c Remove special cased hugetlb handling code within the page cache by changing the granularity of ->index to the base page size rather than the huge page size. The motivation of this patch is to reduce complexity within the filemap code while also increasing performance by removing branches that are evaluated on every page cache lookup. To support the change in index, new wrappers for hugetlb page cache interactions are added. These wrappers perform the conversion to a linear index which is now expected by the page cache for huge pages. ========================= PERFORMANCE ====================================== Perf was used to check the performance differences after the patch. Overall the performance is similar to mainline with a very small larger overhead that occurs in __filemap_add_folio() and hugetlb_add_to_page_cache(). This is because of the larger overhead that occurs in xa_load() and xa_store() as the xarray is now using more entries to store hugetlb folios in the page cache. Timing aarch64 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt real 1m49.568s user 0m0.000s sys 1m49.461s 6.5-rc3: [root]# time fallocate -l 700GB test.txt real 1m47.495s user 0m0.000s sys 1m47.370s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m47.024s user 0m0.000s sys 1m46.921s 6.5-rc3: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m44.551s user 0m0.000s sys 1m44.438s x86 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt real 0m22.383s user 0m0.000s sys 0m22.255s 6.5-rc3: [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt real 0m22.735s user 0m0.038s sys 0m22.567s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt real 0m25.786s user 0m0.001s sys 0m25.589s 6.5-rc3: [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt real 0m33.454s user 0m0.001s sys 0m33.193s aarch64: workload - fallocate a 700GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--95.04%--__pi_clear_page | |--3.57%--clear_huge_page | | | |--2.63%--rcu_all_qs | | | --0.91%--__cond_resched | --0.67%--__cond_resched 0.17% 0.00% 0 fallocate [kernel.vmlinux] [k] hugetlb_add_to_page_cache 0.14% 0.10% 11 fallocate [kernel.vmlinux] [k] __filemap_add_folio 6.5-rc3 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--94.91%--__pi_clear_page | |--4.11%--clear_huge_page | | | |--3.00%--rcu_all_qs | | | --1.10%--__cond_resched | --0.59%--__cond_resched 0.08% 0.01% 1 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.05% 0.03% 3 fallocate [kernel.kallsyms] [k] __filemap_add_folio x86 workload - fallocate a 100GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: hugetlbfs_fallocate | --99.57%--clear_huge_page | --98.47%--clear_page_erms | --0.53%--asm_sysvec_apic_timer_interrupt 0.04% 0.04% 1 fallocate [kernel.kallsyms] [k] xa_load 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] __filemap_add_folio 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] xas_store 6.5-rc3 2MB Page Size: --99.93%--__x64_sys_fallocate vfs_fallocate hugetlbfs_fallocate | --99.38%--clear_huge_page | |--98.40%--clear_page_erms | --0.59%--__cond_resched 0.03% 0.03% 1 fallocate [kernel.kallsyms] [k] __filemap_add_folio ========================= TESTING ====================================== This patch passes libhugetlbfs tests and LTP hugetlb tests ********** TEST SUMMARY * 2M * 32-bit 64-bit * Total testcases: 110 113 * Skipped: 0 0 * PASS: 107 113 * FAIL: 0 0 * Killed by signal: 3 0 * Bad configuration: 0 0 * Expected FAIL: 0 0 * Unexpected PASS: 0 0 * Test not present: 0 0 * Strange test result: 0 0 ********** Done executing testcases. LTP Version: 20220527-178-g2761a81c4 page migration was also tested using Mike Kravetz's test program.[8] [dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Dan Carpenter Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 37 +++++++++++++++++++------------------ include/linux/hugetlb.h | 12 ++++++++++++ include/linux/pagemap.h | 32 ++------------------------------ mm/filemap.c | 34 ++++++++++------------------------ mm/hugetlb.c | 32 ++++++-------------------------- mm/migrate.c | 6 +++--- 6 files changed, 52 insertions(+), 101 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 60fce26ff937..926d01c493fb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t retval = 0; while (iov_iter_count(to)) { - struct page *page; + struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ @@ -352,18 +352,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } nr = nr - offset; - /* Find the page */ - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + /* Find the folio */ + folio = filemap_lock_hugetlb_folio(h, mapping, index); + if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { - unlock_page(page); + folio_unlock(folio); - if (!PageHWPoison(page)) + if (!folio_test_has_hwpoisoned(folio)) want = nr; else { /* @@ -371,19 +371,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) * touching the 1st raw HWPOISON subpage after * offset. */ - want = adjust_range_hwpoison(page, offset, nr); + want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { - put_page(page); + folio_put(folio); retval = -EIO; break; } } /* - * We have the page, copy it to user space buffer. + * We have the folio, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, want, to); - put_page(page); + copied = copy_folio_to_iter(folio, offset, want, to); + folio_put(folio); } offset += copied; retval += copied; @@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> huge_page_shift(h); - const pgoff_t end = lend >> huge_page_shift(h); + const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); - next = start; + next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = folio->index; + index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } if (truncate_op) - (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); + (void)hugetlb_unreserve_pages(inode, + lstart >> huge_page_shift(h), + LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) @@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; @@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - folio = filemap_get_folio(mapping, index); + folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a30686e649f7..d935b0584f1d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) return huge_page_size(h) / 512; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return filemap_lock_folio(mapping, idx << huge_page_order(h)); +} + #include #ifndef is_hugepage_only_range @@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return NULL; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return NULL; +} + static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 351c3b7f93a1..759b29d9a69a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio) */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return &folio->page; return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } @@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return folio->index == index; return index - folio_index(folio) < folio_nr_pages(folio); } @@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, } /* - * Get index of the page within radix-tree (but not for hugetlb pages). - * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + * Get the offset in PAGE_SIZE (even for hugetlb pages). */ -static inline pgoff_t page_to_index(struct page *page) +static inline pgoff_t page_to_pgoff(struct page *page) { struct page *head; @@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page) return head->index + page - head; } -extern pgoff_t hugetlb_basepage_index(struct page *page); - -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). - * (TODO: hugetlb pages should have ->index in PAGE_SIZE) - */ -static inline pgoff_t page_to_pgoff(struct page *page) -{ - if (unlikely(PageHuge(page))) - return hugetlb_basepage_index(page); - return page_to_index(page); -} - /* * Return byte-offset into filesystem object for page. */ @@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). - * (TODO: hugetlb folios should have ->index in PAGE_SIZE) */ static inline pgoff_t folio_pgoff(struct folio *folio) { - if (unlikely(folio_test_hugetlb(folio))) - return hugetlb_basepage_index(&folio->page); return folio->index; } -extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address); - static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; - if (unlikely(is_vm_hugetlb_page(vma))) - return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; diff --git a/mm/filemap.c b/mm/filemap.c index 9481ffaf24e6..340c693112a9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping, mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by a single entry in the xarray */ - if (!folio_test_hugetlb(folio)) { - xas_set_order(&xas, folio->index, folio_order(folio)); - nr = folio_nr_pages(folio); - } + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio) if (free_folio) free_folio(folio); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } @@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping, if (!huge) { int error = mem_cgroup_charge(folio, NULL, gfp); - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) return error; charged = true; - xas_set_order(&xas, index, folio_order(folio)); - nr = folio_nr_pages(folio); } + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); + xas_set_order(&xas, index, folio_order(folio)); + nr = folio_nr_pages(folio); + gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; @@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2104,7 +2102,7 @@ put: int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2232,10 +2224,7 @@ update_start: if (nr) { folio = fbatch->folios[nr - 1]; - if (folio_test_hugetlb(folio)) - *start = folio->index + 1; - else - *start = folio_next_index(folio); + *start = folio->index + folio_nr_pages(folio); } out: rcu_read_unlock(); @@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de220e3ff8be..0aaf28ce32e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t) /* * Convert the address within this vma to the page offset within - * the mapping, in pagecache page units; huge pages here. + * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. @@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t hugetlb_basepage_index(struct page *page) -{ - struct page *page_head = compound_head(page); - pgoff_t index = page_index(page_head); - unsigned long compound_idx; - - if (compound_order(page_head) > MAX_ORDER) - compound_idx = page_to_pfn(page) - page_to_pfn(page_head); - else - compound_idx = page - page_head; - - return (index << compound_order(page_head)) + compound_idx; -} - static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = vma_hugecache_offset(h, vma, address); + pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); @@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping struct hstate *h = hstate_inode(inode); int err; + idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); @@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -6205,7 +6185,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_folio(mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (is_continue) { ret = -EFAULT; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; diff --git a/mm/migrate.c b/mm/migrate.c index 7d1804c4a5d9..942f8c9cd068 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -524,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; xas_lock_irq(&xas); - expected_count = 2 + folio_has_private(src); + expected_count = folio_expected_refs(mapping, src); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; @@ -533,11 +533,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, dst->index = src->index; dst->mapping = src->mapping; - folio_get(dst); + folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); - folio_ref_unfreeze(src, expected_count - 1); + folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); -- cgit v1.2.3 From a48bf7b4757cd8de3497c2878536f46a8d2da65c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 10:44:33 -0700 Subject: mm/hugetlb: replace page_ref_freeze() with folio_ref_freeze() in hugetlb_folio_init_vmemmap() No functional difference, folio_ref_freeze() is currently a wrapper for page_ref_freeze(). Link: https://lkml.kernel.org/r/20230926174433.81241-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Muchun Song Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Usama Arif Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0aaf28ce32e5..36b40bc9ac25 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3191,7 +3191,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, /* Prepare folio head */ __folio_clear_reserved(folio); __folio_set_head(folio); - ret = page_ref_freeze(&folio->page, 1); + ret = folio_ref_freeze(folio, 1); VM_BUG_ON(!ret); /* Initialize the necessary tail struct pages */ hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages); -- cgit v1.2.3 From 07a8bdd4120ced3490ef9adf51b8086af0aaa8e7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:25 +0800 Subject: memory tiering: add abstract distance calculation algorithms management Patch series "memory tiering: calculate abstract distance based on ACPI HMAT", v4. We have the explicit memory tiers framework to manage systems with multiple types of memory, e.g., DRAM in DIMM slots and CXL memory devices. Where, same kind of memory devices will be grouped into memory types, then put into memory tiers. To describe the performance of a memory type, abstract distance is defined. Which is in direct proportion to the memory latency and inversely proportional to the memory bandwidth. To keep the code as simple as possible, fixed abstract distance is used in dax/kmem to describe slow memory such as Optane DCPMM. To support more memory types, in this series, we added the abstract distance calculation algorithm management mechanism, provided a algorithm implementation based on ACPI HMAT, and used the general abstract distance calculation interface in dax/kmem driver. So, dax/kmem can support HBM (high bandwidth memory) in addition to the original Optane DCPMM. This patch (of 4): The abstract distance may be calculated by various drivers, such as ACPI HMAT, CXL CDAT, etc. While it may be used by various code which hot-add memory node, such as dax/kmem etc. To decouple the algorithm users and the providers, the abstract distance calculation algorithms management mechanism is implemented in this patch. It provides interface for the providers to register the implementation, and interface for the users. Multiple algorithm implementations can cooperate via calculating abstract distance for different memory nodes. The preference of algorithm implementations can be specified via priority (notifier_block.priority). Link: https://lkml.kernel.org/r/20230926060628.265989-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230926060628.265989-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Alistair Popple Reviewed-by: Dave Jiang Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- include/linux/memory-tiers.h | 19 ++++++++++++++ mm/memory-tiers.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 4fa178b50784..fddc59990644 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * Each tier cover a abstrace distance chunk size of 128 */ @@ -36,6 +37,9 @@ struct memory_dev_type *alloc_memory_type(int adistance); void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); +int register_mt_adistance_algorithm(struct notifier_block *nb); +int unregister_mt_adistance_algorithm(struct notifier_block *nb); +int mt_calc_adistance(int node, int *adist); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -97,5 +101,20 @@ static inline bool node_is_toptier(int node) { return true; } + +static inline int register_mt_adistance_algorithm(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_mt_adistance_algorithm(struct notifier_block *nb) +{ + return 0; +} + +static inline int mt_calc_adistance(int node, int *adist) +{ + return NOTIFY_DONE; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 876d8a5e210e..4301e7e89223 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "internal.h" @@ -105,6 +106,8 @@ static int top_tier_adistance; static struct demotion_nodes *node_demotion __read_mostly; #endif /* CONFIG_MIGRATION */ +static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); + static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); @@ -592,6 +595,62 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +/** + * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm + * @nb: The notifier block which describe the algorithm + * + * Return: 0 on success, errno on error. + * + * Every memory tiering abstract distance algorithm provider needs to + * register the algorithm with register_mt_adistance_algorithm(). To + * calculate the abstract distance for a specified memory node, the + * notifier function will be called unless some high priority + * algorithm has provided result. The prototype of the notifier + * function is as follows, + * + * int (*algorithm_notifier)(struct notifier_block *nb, + * unsigned long nid, void *data); + * + * Where "nid" specifies the memory node, "data" is the pointer to the + * returned abstract distance (that is, "int *adist"). If the + * algorithm provides the result, NOTIFY_STOP should be returned. + * Otherwise, return_value & %NOTIFY_STOP_MASK == 0 to allow the next + * algorithm in the chain to provide the result. + */ +int register_mt_adistance_algorithm(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&mt_adistance_algorithms, nb); +} +EXPORT_SYMBOL_GPL(register_mt_adistance_algorithm); + +/** + * unregister_mt_adistance_algorithm() - Unregister memory tiering abstract distance algorithm + * @nb: the notifier block which describe the algorithm + * + * Return: 0 on success, errno on error. + */ +int unregister_mt_adistance_algorithm(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&mt_adistance_algorithms, nb); +} +EXPORT_SYMBOL_GPL(unregister_mt_adistance_algorithm); + +/** + * mt_calc_adistance() - Calculate abstract distance with registered algorithms + * @node: the node to calculate abstract distance for + * @adist: the returned abstract distance + * + * Return: if return_value & %NOTIFY_STOP_MASK != 0, then some + * abstract distance algorithm provides the result, and return it via + * @adist. Otherwise, no algorithm can provide the result and @adist + * will be kept as it is. + */ +int mt_calc_adistance(int node, int *adist) +{ + return blocking_notifier_call_chain(&mt_adistance_algorithms, node, adist); +} +EXPORT_SYMBOL_GPL(mt_calc_adistance); + static int __meminit memtier_hotplug_callback(struct notifier_block *self, unsigned long action, void *_arg) { -- cgit v1.2.3 From d0376aac59a166cd7bd9d1a9768e31e71002631b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:26 +0800 Subject: acpi, hmat: refactor hmat_register_target_initiators() Previously, in hmat_register_target_initiators(), the performance attributes are calculated and the corresponding sysfs links and files are created too. Which is called during memory onlining. But now, to calculate the abstract distance of a memory target before memory onlining, we need to calculate the performance attributes for a memory target without creating sysfs links and files. To do that, hmat_register_target_initiators() is refactored to make it possible to calculate performance attributes separately. Link: https://lkml.kernel.org/r/20230926060628.265989-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Alistair Popple Tested-by: Alistair Popple Tested-by: Bharata B Rao Reviewed-by: Dave Jiang Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 81 ++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 51 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index bba268ecd802..2dee0098f1a9 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -582,28 +582,25 @@ static int initiators_to_nodemask(unsigned long *p_nodes) return 0; } -static void hmat_register_target_initiators(struct memory_target *target) +static void hmat_update_target_attrs(struct memory_target *target, + unsigned long *p_nodes, int access) { - static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); struct memory_initiator *initiator; - unsigned int mem_nid, cpu_nid; + unsigned int cpu_nid; struct memory_locality *loc = NULL; u32 best = 0; - bool access0done = false; int i; - mem_nid = pxm_to_node(target->memory_pxm); + bitmap_zero(p_nodes, MAX_NUMNODES); /* - * If the Address Range Structure provides a local processor pxm, link + * If the Address Range Structure provides a local processor pxm, set * only that one. Otherwise, find the best performance attributes and - * register all initiators that match. + * collect all initiators that match. */ if (target->processor_pxm != PXM_INVAL) { cpu_nid = pxm_to_node(target->processor_pxm); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - access0done = true; - if (node_state(cpu_nid, N_CPU)) { - register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + if (access == 0 || node_state(cpu_nid, N_CPU)) { + set_bit(target->processor_pxm, p_nodes); return; } } @@ -617,47 +614,10 @@ static void hmat_register_target_initiators(struct memory_target *target) * We'll also use the sorting to prime the candidate nodes with known * initiators. */ - bitmap_zero(p_nodes, MAX_NUMNODES); list_sort(NULL, &initiators, initiator_cmp); if (initiators_to_nodemask(p_nodes) < 0) return; - if (!access0done) { - for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { - loc = localities_types[i]; - if (!loc) - continue; - - best = 0; - list_for_each_entry(initiator, &initiators, node) { - u32 value; - - if (!test_bit(initiator->processor_pxm, p_nodes)) - continue; - - value = hmat_initiator_perf(target, initiator, - loc->hmat_loc); - if (hmat_update_best(loc->hmat_loc->data_type, value, &best)) - bitmap_clear(p_nodes, 0, initiator->processor_pxm); - if (value != best) - clear_bit(initiator->processor_pxm, p_nodes); - } - if (best) - hmat_update_target_access(target, loc->hmat_loc->data_type, - best, 0); - } - - for_each_set_bit(i, p_nodes, MAX_NUMNODES) { - cpu_nid = pxm_to_node(i); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 0); - } - } - - /* Access 1 ignores Generic Initiators */ - bitmap_zero(p_nodes, MAX_NUMNODES); - if (initiators_to_nodemask(p_nodes) < 0) - return; - for (i = WRITE_LATENCY; i <= READ_BANDWIDTH; i++) { loc = localities_types[i]; if (!loc) @@ -667,7 +627,7 @@ static void hmat_register_target_initiators(struct memory_target *target) list_for_each_entry(initiator, &initiators, node) { u32 value; - if (!initiator->has_cpu) { + if (access == 1 && !initiator->has_cpu) { clear_bit(initiator->processor_pxm, p_nodes); continue; } @@ -681,14 +641,33 @@ static void hmat_register_target_initiators(struct memory_target *target) clear_bit(initiator->processor_pxm, p_nodes); } if (best) - hmat_update_target_access(target, loc->hmat_loc->data_type, best, 1); + hmat_update_target_access(target, loc->hmat_loc->data_type, best, access); } +} + +static void __hmat_register_target_initiators(struct memory_target *target, + unsigned long *p_nodes, + int access) +{ + unsigned int mem_nid, cpu_nid; + int i; + + mem_nid = pxm_to_node(target->memory_pxm); + hmat_update_target_attrs(target, p_nodes, access); for_each_set_bit(i, p_nodes, MAX_NUMNODES) { cpu_nid = pxm_to_node(i); - register_memory_node_under_compute_node(mem_nid, cpu_nid, 1); + register_memory_node_under_compute_node(mem_nid, cpu_nid, access); } } +static void hmat_register_target_initiators(struct memory_target *target) +{ + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); + + __hmat_register_target_initiators(target, p_nodes, 0); + __hmat_register_target_initiators(target, p_nodes, 1); +} + static void hmat_register_target_cache(struct memory_target *target) { unsigned mem_nid = pxm_to_node(target->memory_pxm); -- cgit v1.2.3 From 3718c02dbd4c88d47b5af003acdb3d1112604ea3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:27 +0800 Subject: acpi, hmat: calculate abstract distance with HMAT A memory tiering abstract distance calculation algorithm based on ACPI HMAT is implemented. The basic idea is as follows. The performance attributes of system default DRAM nodes are recorded as the base line. Whose abstract distance is MEMTIER_ADISTANCE_DRAM. Then, the ratio of the abstract distance of a memory node (target) to MEMTIER_ADISTANCE_DRAM is scaled based on the ratio of the performance attributes of the node to that of the default DRAM nodes. The functions to record the read/write latency/bandwidth of the default DRAM nodes and calculate abstract distance according to read/write latency/bandwidth ratio will be used by CXL CDAT (Coherent Device Attribute Table) and other memory device drivers. So, they are put in memory-tiers.c. Link: https://lkml.kernel.org/r/20230926060628.265989-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Dave Jiang Reviewed-by: Alistair Popple Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- drivers/acpi/numa/hmat.c | 65 ++++++++++++++++++++++++++- include/linux/memory-tiers.h | 18 ++++++++ mm/memory-tiers.c | 103 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 183 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/numa/hmat.c b/drivers/acpi/numa/hmat.c index 2dee0098f1a9..9ef5f1bdcfdb 100644 --- a/drivers/acpi/numa/hmat.c +++ b/drivers/acpi/numa/hmat.c @@ -24,6 +24,7 @@ #include #include #include +#include static u8 hmat_revision; static int hmat_disable __initdata; @@ -759,6 +760,61 @@ static int hmat_callback(struct notifier_block *self, return NOTIFY_OK; } +static int hmat_set_default_dram_perf(void) +{ + int rc; + int nid, pxm; + struct memory_target *target; + struct node_hmem_attrs *attrs; + + if (!default_dram_type) + return -EIO; + + for_each_node_mask(nid, default_dram_type->nodes) { + pxm = node_to_pxm(nid); + target = find_mem_target(pxm); + if (!target) + continue; + attrs = &target->hmem_attrs[1]; + rc = mt_set_default_dram_perf(nid, attrs, "ACPI HMAT"); + if (rc) + return rc; + } + + return 0; +} + +static int hmat_calculate_adistance(struct notifier_block *self, + unsigned long nid, void *data) +{ + static DECLARE_BITMAP(p_nodes, MAX_NUMNODES); + struct memory_target *target; + struct node_hmem_attrs *perf; + int *adist = data; + int pxm; + + pxm = node_to_pxm(nid); + target = find_mem_target(pxm); + if (!target) + return NOTIFY_OK; + + mutex_lock(&target_lock); + hmat_update_target_attrs(target, p_nodes, 1); + mutex_unlock(&target_lock); + + perf = &target->hmem_attrs[1]; + + if (mt_perf_to_adistance(perf, adist)) + return NOTIFY_OK; + + return NOTIFY_STOP; +} + +static struct notifier_block hmat_adist_nb __meminitdata = { + .notifier_call = hmat_calculate_adistance, + .priority = 100, +}; + static __init void hmat_free_structures(void) { struct memory_target *target, *tnext; @@ -841,8 +897,13 @@ static __init int hmat_init(void) hmat_register_targets(); /* Keep the table and structures if the notifier may use them */ - if (!hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI)) - return 0; + if (hotplug_memory_notifier(hmat_callback, HMAT_CALLBACK_PRI)) + goto out_put; + + if (!hmat_set_default_dram_perf()) + register_mt_adistance_algorithm(&hmat_adist_nb); + + return 0; out_put: hmat_free_structures(); acpi_put_table(tbl); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index fddc59990644..59d15c6d3c0d 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -31,8 +31,11 @@ struct memory_dev_type { struct kref kref; }; +struct node_hmem_attrs; + #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; +extern struct memory_dev_type *default_dram_type; struct memory_dev_type *alloc_memory_type(int adistance); void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); @@ -40,6 +43,9 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype); int register_mt_adistance_algorithm(struct notifier_block *nb); int unregister_mt_adistance_algorithm(struct notifier_block *nb); int mt_calc_adistance(int node, int *adist); +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, + const char *source); +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist); #ifdef CONFIG_MIGRATION int next_demotion_node(int node); void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets); @@ -64,6 +70,7 @@ static inline bool node_is_toptier(int node) #else #define numa_demotion_enabled false +#define default_dram_type NULL /* * CONFIG_NUMA implementation returns non NULL error. */ @@ -116,5 +123,16 @@ static inline int mt_calc_adistance(int node, int *adist) { return NOTIFY_DONE; } + +static inline int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, + const char *source) +{ + return -EIO; +} + +static inline int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +{ + return -EIO; +} #endif /* CONFIG_NUMA */ #endif /* _LINUX_MEMORY_TIERS_H */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 4301e7e89223..085321c77123 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -37,7 +37,7 @@ struct node_memory_type_map { static DEFINE_MUTEX(memory_tier_lock); static LIST_HEAD(memory_tiers); static struct node_memory_type_map node_memory_types[MAX_NUMNODES]; -static struct memory_dev_type *default_dram_type; +struct memory_dev_type *default_dram_type; static struct bus_type memory_tier_subsys = { .name = "memory_tiering", @@ -108,6 +108,11 @@ static struct demotion_nodes *node_demotion __read_mostly; static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms); +static bool default_dram_perf_error; +static struct node_hmem_attrs default_dram_perf; +static int default_dram_perf_ref_nid = NUMA_NO_NODE; +static const char *default_dram_perf_ref_source; + static inline struct memory_tier *to_memory_tier(struct device *device) { return container_of(device, struct memory_tier, dev); @@ -595,6 +600,102 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) } EXPORT_SYMBOL_GPL(clear_node_memory_type); +static void dump_hmem_attrs(struct node_hmem_attrs *attrs, const char *prefix) +{ + pr_info( +"%sread_latency: %u, write_latency: %u, read_bandwidth: %u, write_bandwidth: %u\n", + prefix, attrs->read_latency, attrs->write_latency, + attrs->read_bandwidth, attrs->write_bandwidth); +} + +int mt_set_default_dram_perf(int nid, struct node_hmem_attrs *perf, + const char *source) +{ + int rc = 0; + + mutex_lock(&memory_tier_lock); + if (default_dram_perf_error) { + rc = -EIO; + goto out; + } + + if (perf->read_latency + perf->write_latency == 0 || + perf->read_bandwidth + perf->write_bandwidth == 0) { + rc = -EINVAL; + goto out; + } + + if (default_dram_perf_ref_nid == NUMA_NO_NODE) { + default_dram_perf = *perf; + default_dram_perf_ref_nid = nid; + default_dram_perf_ref_source = kstrdup(source, GFP_KERNEL); + goto out; + } + + /* + * The performance of all default DRAM nodes is expected to be + * same (that is, the variation is less than 10%). And it + * will be used as base to calculate the abstract distance of + * other memory nodes. + */ + if (abs(perf->read_latency - default_dram_perf.read_latency) * 10 > + default_dram_perf.read_latency || + abs(perf->write_latency - default_dram_perf.write_latency) * 10 > + default_dram_perf.write_latency || + abs(perf->read_bandwidth - default_dram_perf.read_bandwidth) * 10 > + default_dram_perf.read_bandwidth || + abs(perf->write_bandwidth - default_dram_perf.write_bandwidth) * 10 > + default_dram_perf.write_bandwidth) { + pr_info( +"memory-tiers: the performance of DRAM node %d mismatches that of the reference\n" +"DRAM node %d.\n", nid, default_dram_perf_ref_nid); + pr_info(" performance of reference DRAM node %d:\n", + default_dram_perf_ref_nid); + dump_hmem_attrs(&default_dram_perf, " "); + pr_info(" performance of DRAM node %d:\n", nid); + dump_hmem_attrs(perf, " "); + pr_info( +" disable default DRAM node performance based abstract distance algorithm.\n"); + default_dram_perf_error = true; + rc = -EINVAL; + } + +out: + mutex_unlock(&memory_tier_lock); + return rc; +} + +int mt_perf_to_adistance(struct node_hmem_attrs *perf, int *adist) +{ + if (default_dram_perf_error) + return -EIO; + + if (default_dram_perf_ref_nid == NUMA_NO_NODE) + return -ENOENT; + + if (perf->read_latency + perf->write_latency == 0 || + perf->read_bandwidth + perf->write_bandwidth == 0) + return -EINVAL; + + mutex_lock(&memory_tier_lock); + /* + * The abstract distance of a memory node is in direct proportion to + * its memory latency (read + write) and inversely proportional to its + * memory bandwidth (read + write). The abstract distance, memory + * latency, and memory bandwidth of the default DRAM nodes are used as + * the base. + */ + *adist = MEMTIER_ADISTANCE_DRAM * + (perf->read_latency + perf->write_latency) / + (default_dram_perf.read_latency + default_dram_perf.write_latency) * + (default_dram_perf.read_bandwidth + default_dram_perf.write_bandwidth) / + (perf->read_bandwidth + perf->write_bandwidth); + mutex_unlock(&memory_tier_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(mt_perf_to_adistance); + /** * register_mt_adistance_algorithm() - Register memory tiering abstract distance algorithm * @nb: The notifier block which describe the algorithm -- cgit v1.2.3 From 6bc2cfdf82d56863b7cf5e86e37a662b2ae5d47e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 26 Sep 2023 14:06:28 +0800 Subject: dax, kmem: calculate abstract distance with general interface Previously, a fixed abstract distance MEMTIER_DEFAULT_DAX_ADISTANCE is used for slow memory type in kmem driver. This limits the usage of kmem driver, for example, it cannot be used for HBM (high bandwidth memory). So, we use the general abstract distance calculation mechanism in kmem drivers to get more accurate abstract distance on systems with proper support. The original MEMTIER_DEFAULT_DAX_ADISTANCE is used as fallback only. Now, multiple memory types may be managed by kmem. These memory types are put into the "kmem_memory_types" list and protected by kmem_memory_type_lock. Link: https://lkml.kernel.org/r/20230926060628.265989-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Tested-by: Bharata B Rao Reviewed-by: Dave Jiang Reviewed-by: Alistair Popple Cc: Aneesh Kumar K.V Cc: Wei Xu Cc: Dan Williams Cc: Dave Hansen Cc: Davidlohr Bueso Cc: Johannes Weiner Cc: Jonathan Cameron Cc: Michal Hocko Cc: Yang Shi Cc: Rafael J Wysocki Signed-off-by: Andrew Morton --- drivers/dax/kmem.c | 62 ++++++++++++++++++++++++++++++++++---------- include/linux/memory-tiers.h | 2 ++ mm/memory-tiers.c | 3 ++- 3 files changed, 53 insertions(+), 14 deletions(-) diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index c57acb73e3db..369c698b7706 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -49,14 +49,52 @@ struct dax_kmem_data { struct resource *res[]; }; -static struct memory_dev_type *dax_slowmem_type; +static DEFINE_MUTEX(kmem_memory_type_lock); +static LIST_HEAD(kmem_memory_types); + +static struct memory_dev_type *kmem_find_alloc_memory_type(int adist) +{ + bool found = false; + struct memory_dev_type *mtype; + + mutex_lock(&kmem_memory_type_lock); + list_for_each_entry(mtype, &kmem_memory_types, list) { + if (mtype->adistance == adist) { + found = true; + break; + } + } + if (!found) { + mtype = alloc_memory_type(adist); + if (!IS_ERR(mtype)) + list_add(&mtype->list, &kmem_memory_types); + } + mutex_unlock(&kmem_memory_type_lock); + + return mtype; +} + +static void kmem_put_memory_types(void) +{ + struct memory_dev_type *mtype, *mtn; + + mutex_lock(&kmem_memory_type_lock); + list_for_each_entry_safe(mtype, mtn, &kmem_memory_types, list) { + list_del(&mtype->list); + put_memory_type(mtype); + } + mutex_unlock(&kmem_memory_type_lock); +} + static int dev_dax_kmem_probe(struct dev_dax *dev_dax) { struct device *dev = &dev_dax->dev; unsigned long total_len = 0; struct dax_kmem_data *data; + struct memory_dev_type *mtype; int i, rc, mapped = 0; int numa_node; + int adist = MEMTIER_DEFAULT_DAX_ADISTANCE; /* * Ensure good NUMA information for the persistent memory. @@ -71,6 +109,11 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } + mt_calc_adistance(numa_node, &adist); + mtype = kmem_find_alloc_memory_type(adist); + if (IS_ERR(mtype)) + return PTR_ERR(mtype); + for (i = 0; i < dev_dax->nr_range; i++) { struct range range; @@ -88,7 +131,7 @@ static int dev_dax_kmem_probe(struct dev_dax *dev_dax) return -EINVAL; } - init_node_memory_type(numa_node, dax_slowmem_type); + init_node_memory_type(numa_node, mtype); rc = -ENOMEM; data = kzalloc(struct_size(data, res, dev_dax->nr_range), GFP_KERNEL); @@ -167,7 +210,7 @@ err_reg_mgid: err_res_name: kfree(data); err_dax_kmem_data: - clear_node_memory_type(numa_node, dax_slowmem_type); + clear_node_memory_type(numa_node, mtype); return rc; } @@ -219,7 +262,7 @@ static void dev_dax_kmem_remove(struct dev_dax *dev_dax) * for that. This implies this reference will be around * till next reboot. */ - clear_node_memory_type(node, dax_slowmem_type); + clear_node_memory_type(node, NULL); } } #else @@ -251,12 +294,6 @@ static int __init dax_kmem_init(void) if (!kmem_name) return -ENOMEM; - dax_slowmem_type = alloc_memory_type(MEMTIER_DEFAULT_DAX_ADISTANCE); - if (IS_ERR(dax_slowmem_type)) { - rc = PTR_ERR(dax_slowmem_type); - goto err_dax_slowmem_type; - } - rc = dax_driver_register(&device_dax_kmem_driver); if (rc) goto error_dax_driver; @@ -264,8 +301,7 @@ static int __init dax_kmem_init(void) return rc; error_dax_driver: - put_memory_type(dax_slowmem_type); -err_dax_slowmem_type: + kmem_put_memory_types(); kfree_const(kmem_name); return rc; } @@ -275,7 +311,7 @@ static void __exit dax_kmem_exit(void) dax_driver_unregister(&device_dax_kmem_driver); if (!any_hotremove_failed) kfree_const(kmem_name); - put_memory_type(dax_slowmem_type); + kmem_put_memory_types(); } MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index 59d15c6d3c0d..1e39d27bee41 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -24,6 +24,8 @@ struct memory_tier; struct memory_dev_type { /* list of memory types that are part of same tier as this type */ struct list_head tier_sibling; + /* list of memory types that are managed by one driver */ + struct list_head list; /* abstract distance for this specific memory type */ int adistance; /* Nodes of same abstract distance */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 085321c77123..8d5291add2bc 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -586,13 +586,14 @@ EXPORT_SYMBOL_GPL(init_node_memory_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype) { mutex_lock(&memory_tier_lock); - if (node_memory_types[node].memtype == memtype) + if (node_memory_types[node].memtype == memtype || !memtype) node_memory_types[node].map_count--; /* * If we umapped all the attached devices to this node, * clear the node memory type. */ if (!node_memory_types[node].map_count) { + memtype = node_memory_types[node].memtype; node_memory_types[node].memtype = NULL; put_memory_type(memtype); } -- cgit v1.2.3 From 5e924ff54d088828794d9f1a4d5bf17808f7270e Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:36 -0700 Subject: mm/ksm: add "smart" page scanning mode Patch series "Smart scanning mode for KSM", v3. This patch series adds "smart scanning" for KSM. What is smart scanning? ======================= KSM evaluates all the candidate pages for each scan. It does not use historic information from previous scans. This has the effect that candidate pages that couldn't be used for KSM de-duplication continue to be evaluated for each scan. The idea of "smart scanning" is to keep historic information. With the historic information we can temporarily skip the candidate page for one or several scans. Details: ======== "Smart scanning" is to keep two small counters to store if the page has been used for KSM. One counter stores how often we already tried to use the page for KSM and the other counter stores how often we skip a page. How often we skip the candidate page depends how often a page failed KSM de-duplication. The code skips a maximum of 8 times. During testing this has shown to be a good compromise for different workloads. New sysfs knob: =============== Smart scanning is not enabled by default. With /sys/kernel/mm/ksm/smart_scan smart scanning can be enabled. Monitoring: =========== To monitor how effective smart scanning is a new sysfs knob has been introduced. /sys/kernel/mm/pages_skipped report how many pages have been skipped by smart scanning. Results: ======== - Various workloads have shown a 20% - 25% reduction in page scans For the instagram workload for instance, the number of pages scanned has been reduced from over 20M pages per scan to less than 15M pages. - Less pages scans also resulted in an overall higher de-duplication rate as some shorter lived pages could be de-duplicated additionally - Less pages scanned allows to reduce the pages_to_scan parameter and this resulted in a 25% reduction in terms of CPU. - The improvements have been observed for workloads that enable KSM with madvise as well as prctl This patch (of 4): This change adds a "smart" page scanning mode for KSM. So far all the candidate pages are continuously scanned to find candidates for de-duplication. There are a considerably number of pages that cannot be de-duplicated. This is costly in terms of CPU. By using smart scanning considerable CPU savings can be achieved. This change takes the history of scanning pages into account and skips the page scanning of certain pages for a while if de-deduplication for this page has not been successful in the past. To do this it introduces two new fields in the ksm_rmap_item structure: age and remaining_skips. age, is the KSM age and remaining_skips determines how often scanning of this page is skipped. The age field is incremented each time the page is scanned and the page cannot be de- duplicated. age updated is capped at U8_MAX. How often a page is skipped is dependent how often de-duplication has been tried so far and the number of skips is currently limited to 8. This value has shown to be effective with different workloads. The feature is currently disable by default and can be enabled with the new smart_scan knob. The feature has shown to be very effective: upt to 25% of the page scans can be eliminated; the pages_to_scan rate can be reduced by 40 - 50% and a similar de-duplication rate can be maintained. [akpm@linux-foundation.org: make ksm_smart_scan default true, for testing] Link: https://lkml.kernel.org/r/20230926040939.516161-1-shr@devkernel.io Link: https://lkml.kernel.org/r/20230926040939.516161-2-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Cc: Stefan Roesch Signed-off-by: Andrew Morton --- mm/ksm.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 981af9c72e7a..8ad66b1c91b9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -56,6 +56,8 @@ #define DO_NUMA(x) do { } while (0) #endif +typedef u8 rmap_age_t; + /** * DOC: Overview * @@ -193,6 +195,8 @@ struct ksm_stable_node { * @node: rb node of this rmap_item in the unstable tree * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node + * @age: number of scan iterations since creation + * @remaining_skips: how many scans to skip */ struct ksm_rmap_item { struct ksm_rmap_item *rmap_list; @@ -205,6 +209,8 @@ struct ksm_rmap_item { struct mm_struct *mm; unsigned long address; /* + low bits used for flags below */ unsigned int oldchecksum; /* when unstable */ + rmap_age_t age; + rmap_age_t remaining_skips; union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ @@ -281,6 +287,10 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* Skip pages that couldn't be de-duplicated previously */ +/* Default to true at least temporarily, for testing */ +static bool ksm_smart_scan = true; + /* The number of zero pages which is placed by KSM */ unsigned long ksm_zero_pages; @@ -2305,6 +2315,73 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, return rmap_item; } +/* + * Calculate skip age for the ksm page age. The age determines how often + * de-duplicating has already been tried unsuccessfully. If the age is + * smaller, the scanning of this page is skipped for less scans. + * + * @age: rmap_item age of page + */ +static unsigned int skip_age(rmap_age_t age) +{ + if (age <= 3) + return 1; + if (age <= 5) + return 2; + if (age <= 8) + return 4; + + return 8; +} + +/* + * Determines if a page should be skipped for the current scan. + * + * @page: page to check + * @rmap_item: associated rmap_item of page + */ +static bool should_skip_rmap_item(struct page *page, + struct ksm_rmap_item *rmap_item) +{ + rmap_age_t age; + + if (!ksm_smart_scan) + return false; + + /* + * Never skip pages that are already KSM; pages cmp_and_merge_page() + * will essentially ignore them, but we still have to process them + * properly. + */ + if (PageKsm(page)) + return false; + + age = rmap_item->age; + if (age != U8_MAX) + rmap_item->age++; + + /* + * Smaller ages are not skipped, they need to get a chance to go + * through the different phases of the KSM merging. + */ + if (age < 3) + return false; + + /* + * Are we still allowed to skip? If not, then don't skip it + * and determine how much more often we are allowed to skip next. + */ + if (!rmap_item->remaining_skips) { + rmap_item->remaining_skips = skip_age(age); + return false; + } + + /* Skip this page */ + rmap_item->remaining_skips--; + remove_rmap_item_from_tree(rmap_item); + return true; +} + static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; @@ -2409,6 +2486,10 @@ next_mm: if (rmap_item) { ksm_scan.rmap_list = &rmap_item->rmap_list; + + if (should_skip_rmap_item(*page, rmap_item)) + goto next_page; + ksm_scan.address += PAGE_SIZE; } else put_page(*page); @@ -3449,6 +3530,28 @@ static ssize_t full_scans_show(struct kobject *kobj, } KSM_ATTR_RO(full_scans); +static ssize_t smart_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%u\n", ksm_smart_scan); +} + +static ssize_t smart_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + bool value; + + err = kstrtobool(buf, &value); + if (err) + return -EINVAL; + + ksm_smart_scan = value; + return count; +} +KSM_ATTR(smart_scan); + static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, @@ -3469,6 +3572,7 @@ static struct attribute *ksm_attrs[] = { &stable_node_chains_prune_millisecs_attr.attr, &use_zero_pages_attr.attr, &general_profit_attr.attr, + &smart_scan_attr.attr, NULL, }; -- cgit v1.2.3 From e5a68991268906a6989f1bf273580c2218662ad6 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:37 -0700 Subject: mm/ksm: add pages_skipped metric This change adds the "pages skipped" metric. To be able to evaluate how successful smart page scanning is, the pages skipped metric can be compared to the pages scanned metric. The pages skipped metric is a cumulative counter. The counter is stored under /sys/kernel/mm/ksm/pages_skipped. Link: https://lkml.kernel.org/r/20230926040939.516161-3-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/ksm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/ksm.c b/mm/ksm.c index 8ad66b1c91b9..7efcc68ccc6e 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -294,6 +294,9 @@ static bool ksm_smart_scan = true; /* The number of zero pages which is placed by KSM */ unsigned long ksm_zero_pages; +/* The number of pages that have been skipped due to "smart scanning" */ +static unsigned long ksm_pages_skipped; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -2377,6 +2380,7 @@ static bool should_skip_rmap_item(struct page *page, } /* Skip this page */ + ksm_pages_skipped++; rmap_item->remaining_skips--; remove_rmap_item_from_tree(rmap_item); return true; @@ -3464,6 +3468,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t pages_skipped_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_skipped); +} +KSM_ATTR_RO(pages_skipped); + static ssize_t ksm_zero_pages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3561,6 +3572,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &pages_skipped_attr.attr, &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA -- cgit v1.2.3 From 75d7dd4138edd54f3b539698cb4c78a8494d305c Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:38 -0700 Subject: mm/ksm: document smart scan mode This adds documentation for the smart scan mode of KSM. [akpm@linux-foundation.org: fix typo] [akpm@linux-foundation.org: document that smart_scan defaults to on] Link: https://lkml.kernel.org/r/20230926040939.516161-4-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 776f244bdae4..9d235c1bbbf2 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -155,6 +155,15 @@ stable_node_chains_prune_millisecs scan. It's a noop if not a single KSM page hit the ``max_page_sharing`` yet. +smart_scan + Historically KSM checked every candidate page for each scan. It did + not take into account historic information. When smart scan is + enabled, pages that have previously not been de-duplicated get + skipped. How often these pages are skipped depends on how often + de-duplication has already been tried and failed. By default this + optimization is enabled. The ``pages_skipped`` metric shows how + effective the setting is. + The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``: general_profit -- cgit v1.2.3 From b0540208a59e11ab55c9b857bf521d8846e515bf Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Mon, 25 Sep 2023 21:09:39 -0700 Subject: mm/ksm: document pages_skipped sysfs knob This adds documentation for the new metric pages_skipped. Link: https://lkml.kernel.org/r/20230926040939.516161-5-shr@devkernel.io Signed-off-by: Stefan Roesch Reviewed-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 9d235c1bbbf2..e59231ac6bb7 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -178,6 +178,8 @@ pages_unshared how many pages unique but repeatedly checked for merging pages_volatile how many pages changing too fast to be placed in a tree +pages_skipped + how many pages did the "smart" page scanning algorithm skip full_scans how many times all mergeable areas have been scanned stable_node_chains -- cgit v1.2.3 From fc7f04dc23db50206bee7891516ed4726c3f64cf Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Tue, 11 Jul 2023 17:13:34 +0800 Subject: selftests/clone3: Fix broken test under !CONFIG_TIME_NS When execute the following command to test clone3 under !CONFIG_TIME_NS: # make headers && cd tools/testing/selftests/clone3 && make && ./clone3 we can see the following error info: # [7538] Trying clone3() with flags 0x80 (size 0) # Invalid argument - Failed to create new process # [7538] clone3() with flags says: -22 expected 0 not ok 18 [7538] Result (-22) is different than expected (0) ... # Totals: pass:18 fail:1 xfail:0 xpass:0 skip:0 error:0 This is because if CONFIG_TIME_NS is not set, but the flag CLONE_NEWTIME (0x80) is used to clone a time namespace, it will return -EINVAL in copy_time_ns(). If kernel does not support CONFIG_TIME_NS, /proc/self/ns/time will be not exist, and then we should skip clone3() test with CLONE_NEWTIME. With this patch under !CONFIG_TIME_NS: # make headers && cd tools/testing/selftests/clone3 && make && ./clone3 ... # Time namespaces are not supported ok 18 # SKIP Skipping clone3() with CLONE_NEWTIME ... # Totals: pass:18 fail:0 xfail:0 xpass:0 skip:1 error:0 Link: https://lkml.kernel.org/r/1689066814-13295-1-git-send-email-yangtiezhu@loongson.cn Fixes: 515bddf0ec41 ("selftests/clone3: test clone3 with CLONE_NEWTIME") Signed-off-by: Tiezhu Yang Suggested-by: Thomas Gleixner Cc: Christian Brauner Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/clone3/clone3.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index e60cf4da8fb0..1c61e3c022cb 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -196,7 +196,12 @@ int main(int argc, char *argv[]) CLONE3_ARGS_NO_TEST); /* Do a clone3() in a new time namespace */ - test_clone3(CLONE_NEWTIME, 0, 0, CLONE3_ARGS_NO_TEST); + if (access("/proc/self/ns/time", F_OK) == 0) { + test_clone3(CLONE_NEWTIME, 0, 0, CLONE3_ARGS_NO_TEST); + } else { + ksft_print_msg("Time namespaces are not supported\n"); + ksft_test_result_skip("Skipping clone3() with CLONE_NEWTIME\n"); + } /* Do a clone3() with exit signal (SIGCHLD) in flags */ test_clone3(SIGCHLD, 0, -EINVAL, CLONE3_ARGS_NO_TEST); -- cgit v1.2.3 From 30a89adf872d2e46323840964c95dc0ae3bb5843 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 16 Oct 2023 19:55:49 -0700 Subject: hugetlb: check for hugetlb folio before vmemmap_restore In commit d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") checks were added to print a warning if hugetlb_vmemmap_restore was called on a non-hugetlb page. This was mostly due to ordering issues in the hugetlb page set up and tear down sequencees. One place missed was the routine dissolve_free_huge_page. Naoya Horiguchi noted: "I saw that VM_WARN_ON_ONCE() in hugetlb_vmemmap_restore is triggered when memory_failure() is called on a free hugetlb page with vmemmap optimization disabled (the warning is not triggered if vmemmap optimization is enabled). I think that we need check folio_test_hugetlb() before dissolve_free_huge_page() calls hugetlb_vmemmap_restore_folio()." Perform the check as suggested by Naoya. Link: https://lkml.kernel.org/r/20231017032140.GA3680@monkey Fixes: d8f5f7e445f0 ("hugetlb: set hugetlb page flag before optimizing vmemmap") Signed-off-by: Mike Kravetz Suggested-by: Naoya Horiguchi Tested-by: Naoya Horiguchi Cc: Anshuman Khandual Cc: Barry Song Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Oscar Salvador Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3fb3a5232173..dfb4534834f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2322,17 +2322,23 @@ retry: * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. + * + * The folio_test_hugetlb check here is because + * remove_hugetlb_folio will clear hugetlb folio flag for + * non-vmemmap optimized hugetlb folios. */ - rc = hugetlb_vmemmap_restore(h, &folio->page); - if (!rc) { - update_and_free_hugetlb_folio(h, folio, false); - } else { - spin_lock_irq(&hugetlb_lock); - add_hugetlb_folio(h, folio, false); - h->max_huge_pages++; - spin_unlock_irq(&hugetlb_lock); - } + if (folio_test_hugetlb(folio)) { + rc = hugetlb_vmemmap_restore(h, &folio->page); + if (rc) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, false); + h->max_huge_pages++; + goto out; + } + } else + rc = 0; + update_and_free_hugetlb_folio(h, folio, false); return rc; } out: -- cgit v1.2.3 From b7c67206594a56be2407ae4da54a114c90609e53 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 22 Sep 2023 10:53:28 -0700 Subject: mm/memcg: annotate struct mem_cgroup_threshold_ary with __counted_by Prepare for the coming implementation by GCC and Clang of the __counted_by attribute. Flexible array members annotated with __counted_by can have their accesses bounds-checked at run-time checking via CONFIG_UBSAN_BOUNDS (for array indexing) and CONFIG_FORTIFY_SOURCE (for strcpy/memcpy-family functions). As found with Coccinelle[1], add __counted_by for struct mem_cgroup_threshold_ary. [1] https://github.com/kees/kernel-tools/blob/trunk/coccinelle/examples/counted_by.cocci Link: https://lkml.kernel.org/r/20230922175327.work.985-kees@kernel.org Signed-off-by: Kees Cook Acked-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Gustavo A. R. Silva Acked-by: Michal Hocko Cc: Johannes Weiner Cc: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 031102ac9311..c6029aeaa268 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -143,7 +143,7 @@ struct mem_cgroup_threshold_ary { /* Size of entries[] */ unsigned int size; /* Array of thresholds */ - struct mem_cgroup_threshold entries[]; + struct mem_cgroup_threshold entries[] __counted_by(size); }; struct mem_cgroup_thresholds { -- cgit v1.2.3 From ff841a06c844b0556b434d67cfc43f4fda56ae7b Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 22 Sep 2023 17:57:39 +0000 Subject: mm: memcg: refactor page state unit helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: memcg: fix tracking of pending stats updates values", v2. While working on adjacent code [1], I realized that the values passed into memcg_rstat_updated() to keep track of the magnitude of pending updates is consistent. It is mostly in pages, but sometimes it can be in bytes or KBs. Fix that. Patch 1 reworks memcg_page_state_unit() so that we can reuse it in patch 2 to check and normalize the units of state updates. [1]https://lore.kernel.org/lkml/20230921081057.3440885-1-yosryahmed@google.com/ This patch (of 2): memcg_page_state_unit() is currently used to identify the unit of a memcg state item so that all stats in memory.stat are in bytes. However, it lies about the units of WORKINGSET_* stats. These stats actually represent pages, but we present them to userspace as a scalar number of events. In retrospect, maybe those stats should have been memcg "events" rather than memcg "state". In preparation for using memcg_page_state_unit() for other purposes that need to know the truthful units of different stat items, break it down into two helpers: - memcg_page_state_unit() retuns the actual unit of the item. - memcg_page_state_output_unit() returns the unit used for output. Use the latter instead of the former in memcg_page_state_output() and lruvec_page_state_output(). While we are at it, let's show cgroup v1 some love and add memcg_page_state_local_output() for consistency. No functional change intended. Link: https://lkml.kernel.org/r/20230922175741.635002-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230922175741.635002-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8713bc796c77..7e65d8c2e685 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1535,7 +1535,7 @@ static const struct memory_stat memory_stats[] = { { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, }; -/* Translate stat items to the correct unit for memory.stat output */ +/* The actual unit of the state item, not the same as the output unit */ static int memcg_page_state_unit(int item) { switch (item) { @@ -1543,6 +1543,22 @@ static int memcg_page_state_unit(int item) case MEMCG_ZSWAP_B: case NR_SLAB_RECLAIMABLE_B: case NR_SLAB_UNRECLAIMABLE_B: + return 1; + case NR_KERNEL_STACK_KB: + return SZ_1K; + default: + return PAGE_SIZE; + } +} + +/* Translate stat items to the correct unit for memory.stat output */ +static int memcg_page_state_output_unit(int item) +{ + /* + * Workingset state is actually in pages, but we export it to userspace + * as a scalar count of events, so special case it here. + */ + switch (item) { case WORKINGSET_REFAULT_ANON: case WORKINGSET_REFAULT_FILE: case WORKINGSET_ACTIVATE_ANON: @@ -1551,17 +1567,23 @@ static int memcg_page_state_unit(int item) case WORKINGSET_RESTORE_FILE: case WORKINGSET_NODERECLAIM: return 1; - case NR_KERNEL_STACK_KB: - return SZ_1K; default: - return PAGE_SIZE; + return memcg_page_state_unit(item); } } static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, int item) { - return memcg_page_state(memcg, item) * memcg_page_state_unit(item); + return memcg_page_state(memcg, item) * + memcg_page_state_output_unit(item); +} + +static inline unsigned long memcg_page_state_local_output( + struct mem_cgroup *memcg, int item) +{ + return memcg_page_state_local(memcg, item) * + memcg_page_state_output_unit(item); } static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) @@ -4113,9 +4135,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], - nr * memcg_page_state_unit(memcg1_stats[i])); + nr = memcg_page_state_local_output(memcg, memcg1_stats[i]); + seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4141,9 +4162,9 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { unsigned long nr; - nr = memcg_page_state(memcg, memcg1_stats[i]); + nr = memcg_page_state_output(memcg, memcg1_stats[i]); seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * memcg_page_state_unit(memcg1_stats[i])); + (u64)nr); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -6625,7 +6646,8 @@ static int memory_stat_show(struct seq_file *m, void *v) static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, int item) { - return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); + return lruvec_page_state(lruvec, item) * + memcg_page_state_output_unit(item); } static int memory_numa_stat_show(struct seq_file *m, void *v) -- cgit v1.2.3 From 7bd5bc3ce9632aefd0eed33a19212a2e55c0f873 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Fri, 22 Sep 2023 17:57:40 +0000 Subject: mm: memcg: normalize the value passed into memcg_rstat_updated() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit memcg_rstat_updated() uses the value of the state update to keep track of the magnitude of pending updates, so that we only do a stats flush when it's worth the work. Most values passed into memcg_rstat_updated() are in pages, however, a few of them are actually in bytes or KBs. To put this into perspective, a 512 byte slab allocation today would look the same as allocating 512 pages. This may result in premature flushes, which means unnecessary work and latency. Normalize all the state values passed into memcg_rstat_updated() to pages. Round up non-zero sub-page to 1 page, because memcg_rstat_updated() ignores 0 page updates. Link: https://lkml.kernel.org/r/20230922175741.635002-3-yosryahmed@google.com Fixes: 5b3be698a872 ("memcg: better bounds on the memcg stats updates") Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Michal Koutný Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e65d8c2e685..8539f2037168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -763,6 +763,22 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) return x; } +static int memcg_page_state_unit(int item); + +/* + * Normalize the value passed into memcg_rstat_updated() to be in pages. Round + * up non-zero sub-page updates to 1 page as zero page updates are ignored. + */ +static int memcg_state_val_in_pages(int idx, int val) +{ + int unit = memcg_page_state_unit(idx); + + if (!val || unit == PAGE_SIZE) + return val; + else + return max(val * unit / PAGE_SIZE, 1UL); +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -775,7 +791,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) return; __this_cpu_add(memcg->vmstats_percpu->state[idx], val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -826,7 +842,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); memcg_stats_unlock(); } -- cgit v1.2.3 From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch() retrieves the addresses of the pages that are written to in a region of virtual memory. This syscall is used in Windows applications and games etc. This syscall is being emulated in pretty slow manner in userspace. Our purpose is to enhance the kernel such that we translate it efficiently in a better way. Currently some out of tree hack patches are being used to efficiently emulate it in some kernels. We intend to replace those with these patches. So the whole gaming on Linux can effectively get benefit from this. It means there would be tons of users of this code. CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, only then we can start dumping pages. The information about pages becomes more and more outdated, while we are processing pages. The new interface solves both these downsides. First, it allows us to read pte bits and clear the soft-dirty bit atomically. It means that CRIU will not need to freeze processes to pre-dump their memory. Second, it clears soft-dirty bits for a specified region of memory. It means CRIU will have actual info about pages to the moment of dumping them. * The new interface has to be much faster because basic page filtering is happening in the kernel. With the old interface, we have to read pagemap for each page. *Implementation Evolution (Short Summary)* From the definition of GetWriteWatch(), we feel like kernel's soft-dirty feature can be used under the hood with some additions like: * reset soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process * get and clear soft-dirty flag for a specific region atomically So we decided to use ioctl on pagemap file to read or/and reset soft-dirty flag. But using soft-dirty flag, sometimes we get extra pages which weren't even written. They had become soft-dirty because of VMA merging and VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were able to by-pass this short coming by ignoring VM_SOFTDIRTY until David reported that mprotect etc messes up the soft-dirty flag while ignoring VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We discussed if we can revert these patches. But we could not reach to any conclusion. So at this point, I made couple of tries to solve this whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation: * [7] Correct the bug fixed wrongly back in 2014. It had potential to cause regression. We left it behind. * [8] Keep a list of soft-dirty part of a VMA across splits and merges. I got the reply don't increase the size of the VMA by 8 bytes. At this point, we left soft-dirty considering it is too much delicate and userfaultfd [9] seemed like the only way forward. From there onward, we have been basing soft-dirty emulation on userfaultfd wp feature where kernel resolves the faults itself when WP_ASYNC feature is used. It was straight forward to add WP_ASYNC feature in userfautlfd. Now we get only those pages dirty or written-to which are really written in reality. (PS There is another WP_UNPOPULATED userfautfd feature is required which is needed to avoid pre-faulting memory before write-protecting [9].) All the different masks were added on the request of CRIU devs to create interface more generic and better. [1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch [2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com [3] https://github.com/google/sanitizers [4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit [5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com [6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/ [7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com [8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com [9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com [10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com This patch (of 6): Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows userfaultfd wr-protect faults to be resolved by the kernel directly. It can be used like a high accuracy version of soft-dirty, without vma modifications during tracking, and also with ranged support by default rather than for a whole mm when reset the protections due to existence of ioctl(UFFDIO_WRITEPROTECT). Several goals of such a dirty tracking interface: 1. All types of memory should be supported and tracable. This is nature for soft-dirty but should mention when the context is userfaultfd, because it used to only support anon/shmem/hugetlb. The problem is for a dirty tracking purpose these three types may not be enough, and it's legal to track anything e.g. any page cache writes from mmap. 2. Protections can be applied to partial of a memory range, without vma split/merge fuss. The hope is that the tracking itself should not affect any vma layout change. It also helps when reset happens because the reset will not need mmap write lock which can block the tracee. 3. Accuracy needs to be maintained. This means we need pte markers to work on any type of VMA. One could question that, the whole concept of async dirty tracking is not really close to fundamentally what userfaultfd used to be: it's not "a fault to be serviced by userspace" anymore. However, using userfaultfd-wp here as a framework is convenient for us in at least: 1. VM_UFFD_WP vma flag, which has a very good name to suite something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new feature bit to identify from a sync version of uffd-wp registration. 2. PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as an arch supports, this also applies to this case where uffd-wp bit will be a hint to dirty information and it will not go lost easily (e.g. when some page cache ptes got zapped). 3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or resetting a range of memory, while there's no counterpart in the old soft-dirty world, hence if this is wanted in a new design we'll need a new interface otherwise. We can somehow understand that commonality because uffd-wp was fundamentally a similar idea of write-protecting pages just like soft-dirty. This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC seems to not usable if without WP_UNPOPULATE. This also gives us chance to modify impl of WP_ASYNC just in case it could be not depending on WP_UNPOPULATED anymore in the future kernels. It's also fine to imply that because both features will rely on PTE_MARKER_UFFD_WP config option, so they'll show up together (or both missing) in an UFFDIO_API probe. vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp. So we can track dirty for all kinds of memory including generic file systems (like XFS, EXT4 or BTRFS). One trick worth mention in do_wp_page() is that we need to manually update vmf->orig_pte here because it can be used later with a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags. The major defect of this approach of dirty tracking is we need to populate the pgtables when tracking starts. Soft-dirty doesn't do it like that. It's unwanted in the case where the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet). One way to improve this is to allow pte markers exist for larger than PTE level for PMD+. That will not change the interface if to implemented, so we can leave that for later. Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 35 ++++++++++++++++++++++++++++ fs/userfaultfd.c | 26 +++++++++++++++++---- include/linux/userfaultfd_k.h | 21 ++++++++++++++++- include/uapi/linux/userfaultfd.h | 9 ++++++- mm/hugetlb.c | 32 ++++++++++++++----------- mm/memory.c | 28 +++++++++++++++++++--- 6 files changed, 129 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 4349a8c2b978..203e26da5f92 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -244,6 +244,41 @@ write-protected (so future writes will also result in a WP fault). These ioctls support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` respectively) to configure the mapping this way. +If the userfaultfd context has ``UFFD_FEATURE_WP_ASYNC`` feature bit set, +any vma registered with write-protection will work in async mode rather +than the default sync mode. + +In async mode, there will be no message generated when a write operation +happens, meanwhile the write-protection will be resolved automatically by +the kernel. It can be seen as a more accurate version of soft-dirty +tracking and it can be different in a few ways: + + - The dirty result will not be affected by vma changes (e.g. vma + merging) because the dirty is only tracked by the pte. + + - It supports range operations by default, so one can enable tracking on + any range of memory as long as page aligned. + + - Dirty information will not get lost if the pte was zapped due to + various reasons (e.g. during split of a shmem transparent huge page). + + - Due to a reverted meaning of soft-dirty (page clean when uffd-wp bit + set; dirty when uffd-wp bit cleared), it has different semantics on + some of the memory operations. For example: ``MADV_DONTNEED`` on + anonymous (or ``MADV_REMOVE`` on a file mapping) will be treated as + dirtying of memory by dropping uffd-wp bit during the procedure. + +The user app can collect the "written/dirty" status by looking up the +uffd-wp bit for the pages being interested in /proc/pagemap. + +The page will not be under track of uffd-wp async mode until the page is +explicitly write-protected by ``ioctl(UFFDIO_WRITEPROTECT)`` with the mode +flag ``UFFDIO_WRITEPROTECT_MODE_WP`` set. Trying to resolve a page fault +that was tracked by async mode userfaultfd-wp is invalid. + +When userfaultfd-wp async mode is used alone, it can be applied to all +kinds of memory. + Memory Poisioning Emulation --------------------------- diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..a7c6ef764e63 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's @@ -1325,6 +1330,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1399,7 +1405,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur, vm_flags)) + if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* @@ -1460,7 +1466,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags)); + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1561,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; ret = -EFAULT; @@ -1615,7 +1622,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur, cur->vm_flags)) + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; @@ -1631,7 +1638,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); /* * Nothing to do: this vma is already registered into this @@ -2018,6 +2025,11 @@ out: return ret; } +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2051,6 +2063,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR @@ -2063,6 +2080,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097c..c98df391bfd8 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) } static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) + unsigned long vm_flags, + bool wp_async) { + vm_flags &= __VM_UFFD_FLAGS; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; + + /* + * If wp async enabled, and WP is the only mode enabled, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for @@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; #endif + + /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } @@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); +extern bool userfaultfd_wp_async(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a3..0dbc81015018 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfb4534834f5..bc654b36df9f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; + if (!userfaultfd_wp_async(vma)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; - spin_unlock(ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); + spin_unlock(ptl); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); } - hugetlb_vma_unlock_read(vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, VM_UFFD_WP); + + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(mm, haddr, ptep, entry); + /* Fallthrough to CoW */ } /* diff --git a/mm/memory.c b/mm/memory.c index 2d209bc4d18c..d0f61d729fb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c @@ -3349,11 +3350,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; + pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (!userfaultfd_wp_async(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); + + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + /* + * Update this to be prepared for following up CoW + * handling + */ + vmf->orig_pte = pte; } /* @@ -4879,8 +4897,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) + goto split; return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); } @@ -4892,6 +4913,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } } +split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- cgit v1.2.3 From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:14 +0500 Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally clear the info about page table entries. The following operations are supported in this IOCTL: - Scan the address range and get the memory ranges matching the provided criteria. This is performed when the output buffer is specified. - Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING`` can be used with or without PM_SCAN_CHECK_WPASYNC. - Both of those operations can be combined into one atomic operation where we can get and write protect the pages as well. Following flags about pages are currently supported: - PAGE_IS_WPALLOWED - Page has async-write-protection enabled - PAGE_IS_WRITTEN - Page has been written to from the time it was write protected - PAGE_IS_FILE - Page is file backed - PAGE_IS_PRESENT - Page is present in the memory - PAGE_IS_SWAPPED - Page is in swapped - PAGE_IS_PFNZERO - Page has zero PFN - PAGE_IS_HUGE - Page is THP or Hugetlb backed This IOCTL can be extended to get information about more PTE bits. The entire address range passed by user [start, end) is scanned until either the user provided buffer is full or max_pages have been found. [akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"] [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning] [arnd@arndb.de: hide unused pagemap_scan_backout_range() function] Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org [sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"] Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Michał Mirosław Signed-off-by: Arnd Bergmann Signed-off-by: Stephen Rothwell Reviewed-by: Andrei Vagin Reviewed-by: Michał Mirosław Cc: Alex Sierra Cc: Al Viro Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 692 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 1 + include/linux/userfaultfd_k.h | 7 + include/uapi/linux/fs.h | 59 ++++ mm/hugetlb.c | 5 +- 5 files changed, 762 insertions(+), 2 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..d4ef9a2bf95d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories = 0; + + if (pte_present(pte)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pte_to_swp_entry(pte); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } else { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pmd_pfn(pmd))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pmd(pmd)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pmd_to_swp_entry(pmd); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if (pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + unsigned long psize; + + if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + return; + + psize = huge_page_size(hstate_vma(vma)); + + if (is_hugetlb_entry_migration(ptent)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else if (!huge_pte_none(ptent)) + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); + else + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + + if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma)) + vma_category |= PAGE_IS_WPALLOWED; + else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. + */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + arch_enter_lazy_mmu_mode(); + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) { + arch_leave_lazy_mmu_mode(); + return ret; + } + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + arch_leave_lazy_mmu_mode(); + walk->action = ACTION_AGAIN; + return 0; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptep_get(pte)); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + + if (flush_end) + flush_tlb_range(vma, start, addr); + + pte_unmap_unlock(start_pte, ptl); + arch_leave_lazy_mmu_mode(); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. */ + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + arg->vec_len * sizeof(struct page_region))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct mmu_notifier_range range; + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, p.arg.start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. */ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e5b9f7e62eeb..205469aa0613 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); +bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c98df391bfd8..f2dc19f40d05 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, return VM_FAULT_SIGBUS; } +static inline long uffd_wp_range(struct vm_area_struct *vma, + unsigned long start, unsigned long len, + bool enable_wp) +{ + return false; +} + static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c..da43810b7485 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* Pagemap ioctl */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + __u64 start; + __u64 end; + __u64 categories; +}; + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + __u64 size; + __u64 flags; + __u64 start; + __u64 end; + __u64 walk_end; + __u64 vec; + __u64 vec_len; + __u64 max_pages; + __u64 category_inverted; + __u64 category_mask; + __u64 category_anyof_mask; + __u64 return_mask; +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc654b36df9f..2878e0e6bac5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte) return false; } -static bool is_hugetlb_entry_hwpoisoned(pte_t pte) +bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry); + set_huge_pte_at(mm, haddr, ptep, entry, + huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } -- cgit v1.2.3 From 12f6b01a0bcbeeab8cc9305673314adb3adf80f7 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:15 +0500 Subject: fs/proc/task_mmu: add fast paths to get/clear PAGE_IS_WRITTEN flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding fast code paths to handle specifically only get and/or clear operation of PAGE_IS_WRITTEN, increases its performance by 0-35%. The results of some test cases are given below: Test-case-1 t1 = (Get + WP) time t2 = WP time t1 t2 Without this patch: 140-170mcs 90-115mcs With this patch: 110mcs 80mcs Worst case diff: 35% faster 30% faster Test-case-2 t3 = atomic Get and WP t3 Without this patch: 120-140mcs With this patch: 100-110mcs Worst case diff: 21% faster Link: https://lkml.kernel.org/r/20230821141518.870589-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Michał Mirosław Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d4ef9a2bf95d..1d994505805b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2145,6 +2145,41 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } + if (!p->vec_out) { + /* Fast path for performing exclusive WP */ + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + if (pte_uffd_wp(ptep_get(pte))) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = addr + PAGE_SIZE; + } + goto flush_and_return; + } + + if (!p->arg.category_anyof_mask && !p->arg.category_inverted && + p->arg.category_mask == PAGE_IS_WRITTEN && + p->arg.return_mask == PAGE_IS_WRITTEN) { + for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { + unsigned long next = addr + PAGE_SIZE; + + if (pte_uffd_wp(ptep_get(pte))) + continue; + ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, + p, addr, &next); + if (next == addr) + break; + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + goto flush_and_return; + } + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptep_get(pte)); @@ -2168,6 +2203,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, flush_end = next; } +flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); -- cgit v1.2.3 From b58aa0f4fee61040bdb7557bf66822e929342ac5 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:16 +0500 Subject: tools headers UAPI: update linux/fs.h with the kernel sources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New IOCTL and macros has been added in the kernel sources. Update the tools header file as well. Link: https://lkml.kernel.org/r/20230821141518.870589-5-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Michał Mirosław Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- tools/include/uapi/linux/fs.h | 59 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h index b7b56871029c..da43810b7485 100644 --- a/tools/include/uapi/linux/fs.h +++ b/tools/include/uapi/linux/fs.h @@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* Pagemap ioctl */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + __u64 start; + __u64 end; + __u64 categories; +}; + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + __u64 size; + __u64 flags; + __u64 start; + __u64 end; + __u64 walk_end; + __u64 vec; + __u64 vec_len; + __u64 max_pages; + __u64 category_inverted; + __u64 category_mask; + __u64 category_anyof_mask; + __u64 return_mask; +}; + #endif /* _UAPI_LINUX_FS_H */ -- cgit v1.2.3 From 18825b8ae9a3d8abd869811aa2a4ea617da5a59e Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:17 +0500 Subject: mm/pagemap: add documentation of PAGEMAP_SCAN IOCTL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add some explanation and method to use write-protection and written-to on memory range. Link: https://lkml.kernel.org/r/20230821141518.870589-6-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Michał Mirosław Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/pagemap.rst | 89 ++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index c8f380271cad..fe17cf210426 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -227,3 +227,92 @@ Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is always 12 at most architectures). Since Linux 3.11 their meaning changes after first clear of soft-dirty bits. Since Linux 4.2 they are used for flags unconditionally. + +Pagemap Scan IOCTL +================== + +The ``PAGEMAP_SCAN`` IOCTL on the pagemap file can be used to get or optionally +clear the info about page table entries. The following operations are supported +in this IOCTL: + +- Scan the address range and get the memory ranges matching the provided criteria. + This is performed when the output buffer is specified. +- Write-protect the pages. The ``PM_SCAN_WP_MATCHING`` is used to write-protect + the pages of interest. The ``PM_SCAN_CHECK_WPASYNC`` aborts the operation if + non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING`` can be + used with or without ``PM_SCAN_CHECK_WPASYNC``. +- Both of those operations can be combined into one atomic operation where we can + get and write protect the pages as well. + +Following flags about pages are currently supported: + +- ``PAGE_IS_WPALLOWED`` - Page has async-write-protection enabled +- ``PAGE_IS_WRITTEN`` - Page has been written to from the time it was write protected +- ``PAGE_IS_FILE`` - Page is file backed +- ``PAGE_IS_PRESENT`` - Page is present in the memory +- ``PAGE_IS_SWAPPED`` - Page is in swapped +- ``PAGE_IS_PFNZERO`` - Page has zero PFN +- ``PAGE_IS_HUGE`` - Page is THP or Hugetlb backed + +The ``struct pm_scan_arg`` is used as the argument of the IOCTL. + + 1. The size of the ``struct pm_scan_arg`` must be specified in the ``size`` + field. This field will be helpful in recognizing the structure if extensions + are done later. + 2. The flags can be specified in the ``flags`` field. The ``PM_SCAN_WP_MATCHING`` + and ``PM_SCAN_CHECK_WPASYNC`` are the only added flags at this time. The get + operation is optionally performed depending upon if the output buffer is + provided or not. + 3. The range is specified through ``start`` and ``end``. + 4. The walk can abort before visiting the complete range such as the user buffer + can get full etc. The walk ending address is specified in``end_walk``. + 5. The output buffer of ``struct page_region`` array and size is specified in + ``vec`` and ``vec_len``. + 6. The optional maximum requested pages are specified in the ``max_pages``. + 7. The masks are specified in ``category_mask``, ``category_anyof_mask``, + ``category_inverted`` and ``return_mask``. + +Find pages which have been written and WP them as well:: + + struct pm_scan_arg arg = { + .size = sizeof(arg), + .flags = PM_SCAN_CHECK_WPASYNC | PM_SCAN_CHECK_WPASYNC, + .. + .category_mask = PAGE_IS_WRITTEN, + .return_mask = PAGE_IS_WRITTEN, + }; + +Find pages which have been written, are file backed, not swapped and either +present or huge:: + + struct pm_scan_arg arg = { + .size = sizeof(arg), + .flags = 0, + .. + .category_mask = PAGE_IS_WRITTEN | PAGE_IS_SWAPPED, + .category_inverted = PAGE_IS_SWAPPED, + .category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_HUGE, + .return_mask = PAGE_IS_WRITTEN | PAGE_IS_SWAPPED | + PAGE_IS_PRESENT | PAGE_IS_HUGE, + }; + +The ``PAGE_IS_WRITTEN`` flag can be considered as a better-performing alternative +of soft-dirty flag. It doesn't get affected by VMA merging of the kernel and hence +the user can find the true soft-dirty pages in case of normal pages. (There may +still be extra dirty pages reported for THP or Hugetlb pages.) + +"PAGE_IS_WRITTEN" category is used with uffd write protect-enabled ranges to +implement memory dirty tracking in userspace: + + 1. The userfaultfd file descriptor is created with ``userfaultfd`` syscall. + 2. The ``UFFD_FEATURE_WP_UNPOPULATED`` and ``UFFD_FEATURE_WP_ASYNC`` features + are set by ``UFFDIO_API`` IOCTL. + 3. The memory range is registered with ``UFFDIO_REGISTER_MODE_WP`` mode + through ``UFFDIO_REGISTER`` IOCTL. + 4. Then any part of the registered memory or the whole memory region must + be write protected using ``PAGEMAP_SCAN`` IOCTL with flag ``PM_SCAN_WP_MATCHING`` + or the ``UFFDIO_WRITEPROTECT`` IOCTL can be used. Both of these perform the + same operation. The former is better in terms of performance. + 5. Now the ``PAGEMAP_SCAN`` IOCTL can be used to either just find pages which + have been written to since they were last marked and/or optionally write protect + the pages as well. -- cgit v1.2.3 From 46fd75d4a3c94dfa5dfcf113881c0c704036b2b2 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:18 +0500 Subject: selftests: mm: add pagemap ioctl tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pagemap ioctl tests. Add several different types of tests to judge the correction of the interface. Link: https://lkml.kernel.org/r/20230821141518.870589-7-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Michał Mirosław Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 2 + tools/testing/selftests/mm/Makefile | 3 +- tools/testing/selftests/mm/config | 1 + tools/testing/selftests/mm/pagemap_ioctl.c | 1660 ++++++++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 4 + 5 files changed, 1669 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/mm/pagemap_ioctl.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index cdc9ce4426b9..cc920c79ff1c 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -18,6 +18,8 @@ mremap_dontunmap mremap_test on-fault-limit transhuge-stress +pagemap_ioctl +*.tmp* protection_keys protection_keys_32 protection_keys_64 diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 6a9fc5693145..2a89989afafc 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -33,7 +33,7 @@ endif MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) -LDLIBS = -lrt -lpthread +LDLIBS = -lrt -lpthread -lm TEST_GEN_FILES = cow TEST_GEN_FILES += compaction_test @@ -60,6 +60,7 @@ TEST_GEN_FILES += mrelease_test TEST_GEN_FILES += mremap_dontunmap TEST_GEN_FILES += mremap_test TEST_GEN_FILES += on-fault-limit +TEST_GEN_PROGS += pagemap_ioctl TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += uffd-stress diff --git a/tools/testing/selftests/mm/config b/tools/testing/selftests/mm/config index be087c4bc396..4309916f629e 100644 --- a/tools/testing/selftests/mm/config +++ b/tools/testing/selftests/mm/config @@ -1,5 +1,6 @@ CONFIG_SYSVIPC=y CONFIG_USERFAULTFD=y +CONFIG_PTE_MARKER_UFFD_WP=y CONFIG_TEST_VMALLOC=m CONFIG_DEVICE_PRIVATE=y CONFIG_TEST_HMM=m diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c new file mode 100644 index 000000000000..0161fb49fc6e --- /dev/null +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -0,0 +1,1660 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include "vm_util.h" +#include "../kselftest.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGEMAP_BITS_ALL (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PAGEMAP_NON_WRITTEN_BITS (PAGE_IS_WPALLOWED | PAGE_IS_FILE | \ + PAGE_IS_PRESENT | PAGE_IS_SWAPPED | \ + PAGE_IS_PFNZERO | PAGE_IS_HUGE) + +#define TEST_ITERATIONS 100 +#define PAGEMAP "/proc/self/pagemap" +int pagemap_fd; +int uffd; +int page_size; +int hpage_size; + +#define LEN(region) ((region.end - region.start)/page_size) + +static long pagemap_ioctl(void *start, int len, void *vec, int vec_len, int flag, + int max_pages, long required_mask, long anyof_mask, long excluded_mask, + long return_mask) +{ + struct pm_scan_arg arg; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + len); + arg.vec = (uintptr_t)vec; + arg.vec_len = vec_len; + arg.flags = flag; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = max_pages; + arg.category_mask = required_mask; + arg.category_anyof_mask = anyof_mask; + arg.category_inverted = excluded_mask; + arg.return_mask = return_mask; + + return ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); +} + +static long pagemap_ioc(void *start, int len, void *vec, int vec_len, int flag, + int max_pages, long required_mask, long anyof_mask, long excluded_mask, + long return_mask, long *walk_end) +{ + struct pm_scan_arg arg; + int ret; + + arg.start = (uintptr_t)start; + arg.end = (uintptr_t)(start + len); + arg.vec = (uintptr_t)vec; + arg.vec_len = vec_len; + arg.flags = flag; + arg.size = sizeof(struct pm_scan_arg); + arg.max_pages = max_pages; + arg.category_mask = required_mask; + arg.category_anyof_mask = anyof_mask; + arg.category_inverted = excluded_mask; + arg.return_mask = return_mask; + + ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); + + if (walk_end) + *walk_end = arg.walk_end; + + return ret; +} + + +int init_uffd(void) +{ + struct uffdio_api uffdio_api; + + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); + if (uffd == -1) + ksft_exit_fail_msg("uffd syscall failed\n"); + + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_WP_UNPOPULATED | UFFD_FEATURE_WP_ASYNC | + UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + ksft_exit_fail_msg("UFFDIO_API\n"); + + if (!(uffdio_api.api & UFFDIO_REGISTER_MODE_WP) || + !(uffdio_api.features & UFFD_FEATURE_WP_UNPOPULATED) || + !(uffdio_api.features & UFFD_FEATURE_WP_ASYNC) || + !(uffdio_api.features & UFFD_FEATURE_WP_HUGETLBFS_SHMEM)) + ksft_exit_fail_msg("UFFDIO_API error %llu\n", uffdio_api.api); + + return 0; +} + +int wp_init(void *lpBaseAddress, int dwRegionSize) +{ + struct uffdio_register uffdio_register; + struct uffdio_writeprotect wp; + + uffdio_register.range.start = (unsigned long)lpBaseAddress; + uffdio_register.range.len = dwRegionSize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + ksft_exit_fail_msg("ioctl(UFFDIO_REGISTER) %d %s\n", errno, strerror(errno)); + + if (!(uffdio_register.ioctls & UFFDIO_WRITEPROTECT)) + ksft_exit_fail_msg("ioctl set is incorrect\n"); + + wp.range.start = (unsigned long)lpBaseAddress; + wp.range.len = dwRegionSize; + wp.mode = UFFDIO_WRITEPROTECT_MODE_WP; + + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp)) + ksft_exit_fail_msg("ioctl(UFFDIO_WRITEPROTECT)\n"); + + return 0; +} + +int wp_free(void *lpBaseAddress, int dwRegionSize) +{ + struct uffdio_register uffdio_register; + + uffdio_register.range.start = (unsigned long)lpBaseAddress; + uffdio_register.range.len = dwRegionSize; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + ksft_exit_fail_msg("ioctl unregister failure\n"); + return 0; +} + +int wp_addr_range(void *lpBaseAddress, int dwRegionSize) +{ + if (pagemap_ioctl(lpBaseAddress, dwRegionSize, NULL, 0, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0) + ksft_exit_fail_msg("error %d %d %s\n", 1, errno, strerror(errno)); + + return 0; +} + +void *gethugetlb_mem(int size, int *shmid) +{ + char *mem; + + if (shmid) { + *shmid = shmget(2, size, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (*shmid < 0) + return NULL; + + mem = shmat(*shmid, 0, 0); + if (mem == (char *)-1) { + shmctl(*shmid, IPC_RMID, NULL); + ksft_exit_fail_msg("Shared memory attach failure\n"); + } + } else { + mem = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_HUGETLB | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + return NULL; + } + + return mem; +} + +int userfaultfd_tests(void) +{ + int mem_size, vec_size, written, num_pages = 16; + char *mem, *vec; + + mem_size = num_pages * page_size; + mem = mmap(NULL, mem_size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + + /* Change protection of pages differently */ + mprotect(mem, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 1 * mem_size/8, mem_size/8, PROT_READ); + mprotect(mem + 2 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 3 * mem_size/8, mem_size/8, PROT_READ); + mprotect(mem + 4 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 5 * mem_size/8, mem_size/8, PROT_NONE); + mprotect(mem + 6 * mem_size/8, mem_size/8, PROT_READ|PROT_WRITE); + mprotect(mem + 7 * mem_size/8, mem_size/8, PROT_READ); + + wp_addr_range(mem + (mem_size/16), mem_size - 2 * (mem_size/8)); + wp_addr_range(mem, mem_size); + + vec_size = mem_size/page_size; + vec = malloc(sizeof(struct page_region) * vec_size); + + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + free(vec); + return 0; +} + +int get_reads(struct page_region *vec, int vec_size) +{ + int i, sum = 0; + + for (i = 0; i < vec_size; i++) + sum += LEN(vec[i]); + + return sum; +} + +int sanity_tests_sd(void) +{ + int mem_size, vec_size, ret, ret2, ret3, i, num_pages = 1000, total_pages = 0; + int total_writes, total_reads, reads, count; + struct page_region *vec, *vec2; + char *mem, *m[2]; + long walk_end; + + vec_size = num_pages/2; + mem_size = num_pages * page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + vec2 = malloc(sizeof(struct page_region) * vec_size); + if (!vec2) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + /* 1. wrong operation */ + ksft_test_result(pagemap_ioctl(mem, 0, vec, vec_size, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s Zero range size is valid\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, NULL, vec_size, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) < 0, + "%s output buffer must be specified with size\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s output buffer can be 0\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, 0, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) == 0, + "%s output buffer can be 0\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, -1, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0, + "%s wrong flag specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC | 0xFF, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) < 0, + "%s flag has extra bits specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, 0, 0, 0, PAGE_IS_WRITTEN) >= 0, + "%s no selection mask is specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, PAGE_IS_WRITTEN, 0, 0) == 0, + "%s no return mask is specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, 0x1000) < 0, + "%s wrong return mask specified\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, 0xFFF, PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN) < 0, + "%s mixture of correct and wrong flag\n", __func__); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, 0, 0, PAGEMAP_BITS_ALL, PAGE_IS_WRITTEN) >= 0, + "%s PAGEMAP_BITS_ALL can be specified with PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", + __func__); + + /* 2. Clear area with larger vec size */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + ksft_test_result(ret >= 0, "%s Clear area with larger vec size\n", __func__); + + /* 3. Repeated pattern of written and non-written pages */ + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, 0, + 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == mem_size/(page_size * 2), + "%s Repeated pattern of written and non-written pages\n", __func__); + + /* 4. Repeated pattern of written and non-written pages in parts */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + num_pages/2 - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec, 2, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ret3 = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret3 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret3, errno, strerror(errno)); + + ksft_test_result((ret + ret3) == num_pages/2 && ret2 == 2, + "%s Repeated pattern of written and non-written pages in parts %d %d %d\n", + __func__, ret, ret3, ret2); + + /* 5. Repeated pattern of written and non-written pages max_pages */ + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + mem[(mem_size/page_size - 1) * page_size]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + num_pages/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret == num_pages/2 && ret2 == 1, + "%s Repeated pattern of written and non-written pages max_pages\n", + __func__); + + /* 6. only get 2 dirty pages and clear them as well */ + vec_size = mem_size/page_size; + memset(mem, -1, mem_size); + + /* get and clear second and third pages */ + ret = pagemap_ioctl(mem + page_size, 2 * page_size, vec, 1, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret2 = pagemap_ioctl(mem, mem_size, vec2, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == 2 && + vec[0].start == (uintptr_t)(mem + page_size) && + ret2 == 2 && LEN(vec2[0]) == 1 && vec2[0].start == (uintptr_t)mem && + LEN(vec2[1]) == vec_size - 3 && + vec2[1].start == (uintptr_t)(mem + 3 * page_size), + "%s only get 2 written pages and clear them as well\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 7. Two regions */ + m[0] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (m[0] == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + m[1] = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (m[1] == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(m[0], mem_size); + wp_init(m[1], mem_size); + wp_addr_range(m[0], mem_size); + wp_addr_range(m[1], mem_size); + + memset(m[0], 'a', mem_size); + memset(m[1], 'b', mem_size); + + wp_addr_range(m[0], mem_size); + + ret = pagemap_ioctl(m[1], mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == mem_size/page_size, + "%s Two regions\n", __func__); + + wp_free(m[0], mem_size); + wp_free(m[1], mem_size); + munmap(m[0], mem_size); + munmap(m[1], mem_size); + + free(vec); + free(vec2); + + /* 8. Smaller vec */ + mem_size = 1050 * page_size; + vec_size = mem_size/(page_size*2); + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + for (i = 0; i < mem_size/page_size; i += 2) + mem[i * page_size]++; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + mem_size/(page_size*5), PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + total_pages += ret; + + ksft_test_result(total_pages == mem_size/(page_size*2), "%s Smaller max_pages\n", __func__); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + total_pages = 0; + + /* 9. Smaller vec */ + mem_size = 10000 * page_size; + vec_size = 50; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + for (count = 0; count < TEST_ITERATIONS; count++) { + total_writes = total_reads = 0; + walk_end = (long)mem; + + for (i = 0; i < mem_size; i += page_size) { + if (rand() % 2) { + mem[i]++; + total_writes++; + } + } + + while (total_reads < total_writes) { + ret = pagemap_ioc((void *)walk_end, mem_size-(walk_end - (long)mem), vec, + vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + if (ret > vec_size) + break; + + reads = get_reads(vec, ret); + total_reads += reads; + } + + if (total_reads != total_writes) + break; + } + + ksft_test_result(count == TEST_ITERATIONS, "Smaller vec\n"); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 10. Walk_end tester */ + vec_size = 1000; + mem_size = vec_size * page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + if (!vec) + ksft_exit_fail_msg("error nomem\n"); + + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioc(mem, 0, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end address\n"); + + ret = pagemap_ioc(mem, 0, vec, vec_size, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end with WP\n"); + + ret = pagemap_ioc(mem, 0, vec, 0, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 0 && walk_end == (long)mem, + "Walk_end: Same start and end with 0 output buffer\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: Big vec\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: vec of minimum length\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size/2), + "Walk_end: Half max pages\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size), + "Walk_end: 1 max page\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + -1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + mem_size), + "Walk_end: max pages\n"); + + wp_addr_range(mem, mem_size); + for (i = 0; i < mem_size; i += 2 * page_size) + mem[i]++; + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Big vec\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end sparse: vec of minimum length\n"); + + ret = pagemap_ioc(mem, mem_size, vec, 1, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size/2, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_end sparse: Max pages specified\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == vec_size/2 && walk_end == (long)(mem + mem_size), + "Walk_endsparse : Half max pages\n"); + + ret = pagemap_ioc(mem, mem_size, vec, vec_size, 0, + 1, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN, &walk_end); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_test_result(ret == 1 && walk_end == (long)(mem + page_size * 2), + "Walk_end: 1 max page\n"); + + free(vec); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + return 0; +} + +int base_tests(char *prefix, char *mem, int mem_size, int skip) +{ + int vec_size, written; + struct page_region *vec, *vec2; + + if (skip) { + ksft_test_result_skip("%s all new pages must not be written (dirty)\n", prefix); + ksft_test_result_skip("%s all pages must be written (dirty)\n", prefix); + ksft_test_result_skip("%s all pages dirty other than first and the last one\n", + prefix); + ksft_test_result_skip("%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix); + ksft_test_result_skip("%s only middle page dirty\n", prefix); + ksft_test_result_skip("%s only two middle pages dirty\n", prefix); + return 0; + } + + vec_size = mem_size/page_size; + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + + /* 1. all new pages must be not be written (dirty) */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", prefix); + + /* 2. all pages must be written */ + memset(mem, -1, mem_size); + + written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0, PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) == mem_size/page_size, + "%s all pages must be written (dirty)\n", prefix); + + /* 3. all pages dirty other than first and the last one */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + memset(mem + page_size, 0, mem_size - (2 * page_size)); + + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) >= vec_size - 2 && LEN(vec[0]) <= vec_size, + "%s all pages dirty other than first and the last one\n", prefix); + + written = pagemap_ioctl(mem, mem_size, vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 0, + "%s PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC\n", prefix); + + /* 4. only middle page dirty */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + mem[vec_size/2 * page_size]++; + + written = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && LEN(vec[0]) >= 1, + "%s only middle page dirty\n", prefix); + + /* 5. only two middle pages dirty and walk over only middle pages */ + written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + mem[vec_size/2 * page_size]++; + mem[(vec_size/2 + 1) * page_size]++; + + written = pagemap_ioctl(&mem[vec_size/2 * page_size], 2 * page_size, vec, 1, 0, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN | PAGE_IS_HUGE); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written == 1 && vec[0].start == (uintptr_t)(&mem[vec_size/2 * page_size]) + && LEN(vec[0]) == 2, + "%s only two middle pages dirty\n", prefix); + + free(vec); + free(vec2); + return 0; +} + +void *gethugepage(int map_size) +{ + int ret; + char *map; + + map = memalign(hpage_size, map_size); + if (!map) + ksft_exit_fail_msg("memalign failed %d %s\n", errno, strerror(errno)); + + ret = madvise(map, map_size, MADV_HUGEPAGE); + if (ret) + return NULL; + + memset(map, 0, map_size); + + return map; +} + +int hpage_unit_tests(void) +{ + char *map; + int ret, ret2; + size_t num_pages = 10; + int map_size = hpage_size * num_pages; + int vec_size = map_size/page_size; + struct page_region *vec, *vec2; + + vec = malloc(sizeof(struct page_region) * vec_size); + vec2 = malloc(sizeof(struct page_region) * vec_size); + if (!vec || !vec2) + ksft_exit_fail_msg("malloc failed\n"); + + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + /* 1. all new huge page must not be written (dirty) */ + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 0, "%s all new huge page must not be written (dirty)\n", + __func__); + + /* 2. all the huge page must not be written */ + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 0, "%s all the huge page must not be written\n", __func__); + + /* 3. all the huge page must be written and clear dirty as well */ + memset(map, -1, map_size); + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && vec[0].start == (uintptr_t)map && + LEN(vec[0]) == vec_size && vec[0].categories == PAGE_IS_WRITTEN, + "%s all the huge page must be written and clear\n", __func__); + + /* 4. only middle page written */ + wp_free(map, map_size); + free(map); + map = gethugepage(map_size); + wp_init(map, map_size); + wp_addr_range(map, map_size); + map[vec_size/2 * page_size]++; + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) > 0, + "%s only middle page written\n", __func__); + + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s all new huge page must be written\n", __func__); + ksft_test_result_skip("%s all the huge page must not be written\n", __func__); + ksft_test_result_skip("%s all the huge page must be written and clear\n", __func__); + ksft_test_result_skip("%s only middle page written\n", __func__); + } + + /* 5. clear first half of huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, 0, map_size); + + wp_addr_range(map, map_size/2); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page\n", __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear first half of huge page\n", __func__); + } + + /* 6. clear first half of huge page with limited buffer */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, 0, map_size); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + vec_size/2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2 && + vec[0].start == (uintptr_t)(map + map_size/2), + "%s clear first half of huge page with limited buffer\n", + __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear first half of huge page with limited buffer\n", + __func__); + } + + /* 7. clear second half of huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, -1, map_size); + + ret = pagemap_ioctl(map + map_size/2, map_size/2, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size/2, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ret = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == vec_size/2, + "%s clear second half huge page\n", __func__); + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s clear second half huge page\n", __func__); + } + + /* 8. get half huge page */ + map = gethugepage(map_size); + if (map) { + wp_init(map, map_size); + wp_addr_range(map, map_size); + + memset(map, -1, map_size); + usleep(100); + + ret = pagemap_ioctl(map, map_size, vec, 1, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + hpage_size/(2*page_size), PAGE_IS_WRITTEN, 0, 0, + PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec[0]) == hpage_size/(2*page_size), + "%s get half huge page\n", __func__); + + ret2 = pagemap_ioctl(map, map_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); + if (ret2 < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret2, errno, strerror(errno)); + + ksft_test_result(ret2 == 1 && LEN(vec[0]) == (map_size - hpage_size/2)/page_size, + "%s get half huge page\n", __func__); + + wp_free(map, map_size); + free(map); + } else { + ksft_test_result_skip("%s get half huge page\n", __func__); + ksft_test_result_skip("%s get half huge page\n", __func__); + } + + free(vec); + free(vec2); + return 0; +} + +int unmapped_region_tests(void) +{ + void *start = (void *)0x10000000; + int written, len = 0x00040000; + int vec_size = len / page_size; + struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); + + /* 1. Get written pages */ + written = pagemap_ioctl(start, len, vec, vec_size, 0, 0, + PAGEMAP_NON_WRITTEN_BITS, 0, 0, PAGEMAP_NON_WRITTEN_BITS); + if (written < 0) + ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + + ksft_test_result(written >= 0, "%s Get status of pages\n", __func__); + + free(vec); + return 0; +} + +static void test_simple(void) +{ + int i; + char *map; + struct page_region vec; + + map = aligned_alloc(page_size, page_size); + if (!map) + ksft_exit_fail_msg("aligned_alloc failed\n"); + + wp_init(map, page_size); + wp_addr_range(map, page_size); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 1) { + ksft_print_msg("written bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + wp_addr_range(map, page_size); + /* Write something to the page to get the written bit enabled on the page */ + map[0]++; + + if (pagemap_ioctl(map, page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0) { + ksft_print_msg("written bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + wp_addr_range(map, page_size); + } + wp_free(map, page_size); + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +int sanity_tests(void) +{ + int mem_size, vec_size, ret, fd, i, buf_size; + struct page_region *vec; + char *mem, *fmem; + struct stat sbuf; + char *tmp_buf; + + /* 1. wrong operation */ + mem_size = 10 * page_size; + vec_size = mem_size / page_size; + + vec = malloc(sizeof(struct page_region) * vec_size); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED || vec == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, + 0, PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0, + "%s WP op can be specified with !PAGE_IS_WRITTEN\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL) >= 0, + "%s required_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL) >= 0, + "%s anyof_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, 0, PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL) >= 0, + "%s excluded_mask specified\n", __func__); + ksft_test_result(pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, PAGEMAP_BITS_ALL, 0, + PAGEMAP_BITS_ALL) >= 0, + "%s required_mask and anyof_mask specified\n", __func__); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 2. Get sd and present pages with anyof_mask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get sd and present pages with anyof_mask\n", __func__); + + /* 3. Get sd and present pages with required_mask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGEMAP_BITS_ALL, 0, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get all the pages with required_mask\n", __func__); + + /* 4. Get sd and present pages with required_mask and anyof_mask */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, PAGE_IS_PRESENT, 0, PAGEMAP_BITS_ALL); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + (vec[0].categories & (PAGE_IS_WRITTEN | PAGE_IS_PRESENT)) == + (PAGE_IS_WRITTEN | PAGE_IS_PRESENT), + "%s Get sd and present pages with required_mask and anyof_mask\n", + __func__); + + /* 5. Don't get sd pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, 0, PAGE_IS_WRITTEN, PAGEMAP_BITS_ALL); + ksft_test_result(ret == 0, "%s Don't get sd pages\n", __func__); + + /* 6. Don't get present pages */ + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, 0, 0, + PAGE_IS_PRESENT, 0, PAGE_IS_PRESENT, PAGEMAP_BITS_ALL); + ksft_test_result(ret == 0, "%s Don't get present pages\n", __func__); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 8. Find written present pages with return mask */ + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + memset(mem, 0, mem_size); + + ret = pagemap_ioctl(mem, mem_size, vec, vec_size, + PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, 0, + 0, PAGEMAP_BITS_ALL, 0, PAGE_IS_WRITTEN); + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)mem && LEN(vec[0]) == vec_size && + vec[0].categories == PAGE_IS_WRITTEN, + "%s Find written present pages with return mask\n", __func__); + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 9. Memory mapped file */ + fd = open(__FILE__, O_RDONLY); + if (fd < 0) + ksft_exit_fail_msg("%s Memory mapped file\n"); + + ret = stat(__FILE__, &sbuf); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + + tmp_buf = malloc(sbuf.st_size); + memcpy(tmp_buf, fmem, sbuf.st_size); + + ret = pagemap_ioctl(fmem, sbuf.st_size, vec, vec_size, 0, 0, + 0, PAGEMAP_NON_WRITTEN_BITS, 0, PAGEMAP_NON_WRITTEN_BITS); + + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && + LEN(vec[0]) == ceilf((float)sbuf.st_size/page_size) && + (vec[0].categories & PAGE_IS_FILE), + "%s Memory mapped file\n", __func__); + + munmap(fmem, sbuf.st_size); + close(fd); + + /* 10. Create and read/write to a memory mapped file */ + buf_size = page_size * 10; + + fd = open(__FILE__".tmp2", O_RDWR | O_CREAT, 0666); + if (fd < 0) + ksft_exit_fail_msg("Read/write to memory: %s\n", + strerror(errno)); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + + wp_init(fmem, buf_size); + wp_addr_range(fmem, buf_size); + + for (i = 0; i < buf_size; i++) + fmem[i] = 'z'; + + msync(fmem, buf_size, MS_SYNC); + + ret = pagemap_ioctl(fmem, buf_size, vec, vec_size, 0, 0, + PAGE_IS_WRITTEN, PAGE_IS_PRESENT | PAGE_IS_SWAPPED | PAGE_IS_FILE, 0, + PAGEMAP_BITS_ALL); + + ksft_test_result(ret >= 0 && vec[0].start == (uintptr_t)fmem && + LEN(vec[0]) == (buf_size/page_size) && + (vec[0].categories & PAGE_IS_WRITTEN), + "%s Read/write to memory\n", __func__); + + wp_free(fmem, buf_size); + munmap(fmem, buf_size); + close(fd); + + free(vec); + return 0; +} + +int mprotect_tests(void) +{ + int ret; + char *mem, *mem2; + struct page_region vec; + int pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + + if (pagemap_fd < 0) { + fprintf(stderr, "open() failed\n"); + exit(1); + } + + /* 1. Map two pages */ + mem = mmap(0, 2 * page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, 2 * page_size); + wp_addr_range(mem, 2 * page_size); + + /* Populate both pages. */ + memset(mem, 1, 2 * page_size); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, "%s Both pages written\n", __func__); + + /* 2. Start tracking */ + wp_addr_range(mem, 2 * page_size); + + ksft_test_result(pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, + PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN) == 0, + "%s Both pages are not written (dirty)\n", __func__); + + /* 3. Remap the second page */ + mem2 = mmap(mem + page_size, page_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON|MAP_FIXED, -1, 0); + if (mem2 == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem2, page_size); + wp_addr_range(mem2, page_size); + + /* Protect + unprotect. */ + mprotect(mem, page_size, PROT_NONE); + mprotect(mem, 2 * page_size, PROT_READ); + mprotect(mem, 2 * page_size, PROT_READ|PROT_WRITE); + + /* Modify both pages. */ + memset(mem, 2, 2 * page_size); + + /* Protect + unprotect. */ + mprotect(mem, page_size, PROT_NONE); + mprotect(mem, page_size, PROT_READ); + mprotect(mem, page_size, PROT_READ|PROT_WRITE); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, + "%s Both pages written after remap and mprotect\n", __func__); + + /* 4. Clear and make the pages written */ + wp_addr_range(mem, 2 * page_size); + + memset(mem, 'A', 2 * page_size); + + ret = pagemap_ioctl(mem, 2 * page_size, &vec, 1, 0, 0, PAGE_IS_WRITTEN, + 0, 0, PAGE_IS_WRITTEN); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + ksft_test_result(ret == 1 && LEN(vec) == 2, + "%s Clear and make the pages written\n", __func__); + + wp_free(mem, 2 * page_size); + munmap(mem, 2 * page_size); + return 0; +} + +/* transact test */ +static const unsigned int nthreads = 6, pages_per_thread = 32, access_per_thread = 8; +static pthread_barrier_t start_barrier, end_barrier; +static unsigned int extra_thread_faults; +static unsigned int iter_count = 1000; +static volatile int finish; + +static ssize_t get_dirty_pages_reset(char *mem, unsigned int count, + int reset, int page_size) +{ + struct pm_scan_arg arg = {0}; + struct page_region rgns[256]; + int i, j, cnt, ret; + + arg.size = sizeof(struct pm_scan_arg); + arg.start = (uintptr_t)mem; + arg.max_pages = count; + arg.end = (uintptr_t)(mem + count * page_size); + arg.vec = (uintptr_t)rgns; + arg.vec_len = sizeof(rgns) / sizeof(*rgns); + if (reset) + arg.flags |= PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC; + arg.category_mask = PAGE_IS_WRITTEN; + arg.return_mask = PAGE_IS_WRITTEN; + + ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg); + if (ret < 0) + ksft_exit_fail_msg("ioctl failed\n"); + + cnt = 0; + for (i = 0; i < ret; ++i) { + if (rgns[i].categories != PAGE_IS_WRITTEN) + ksft_exit_fail_msg("wrong flags\n"); + + for (j = 0; j < LEN(rgns[i]); ++j) + cnt++; + } + + return cnt; +} + +void *thread_proc(void *mem) +{ + int *m = mem; + long curr_faults, faults; + struct rusage r; + unsigned int i; + int ret; + + if (getrusage(RUSAGE_THREAD, &r)) + ksft_exit_fail_msg("getrusage\n"); + + curr_faults = r.ru_minflt; + + while (!finish) { + ret = pthread_barrier_wait(&start_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + for (i = 0; i < access_per_thread; ++i) + __atomic_add_fetch(m + i * (0x1000 / sizeof(*m)), 1, __ATOMIC_SEQ_CST); + + ret = pthread_barrier_wait(&end_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + if (getrusage(RUSAGE_THREAD, &r)) + ksft_exit_fail_msg("getrusage\n"); + + faults = r.ru_minflt - curr_faults; + if (faults < access_per_thread) + ksft_exit_fail_msg("faults < access_per_thread"); + + __atomic_add_fetch(&extra_thread_faults, faults - access_per_thread, + __ATOMIC_SEQ_CST); + curr_faults = r.ru_minflt; + } + + return NULL; +} + +static void transact_test(int page_size) +{ + unsigned int i, count, extra_pages; + pthread_t th; + char *mem; + int ret, c; + + if (pthread_barrier_init(&start_barrier, NULL, nthreads + 1)) + ksft_exit_fail_msg("pthread_barrier_init\n"); + + if (pthread_barrier_init(&end_barrier, NULL, nthreads + 1)) + ksft_exit_fail_msg("pthread_barrier_init\n"); + + mem = mmap(NULL, 0x1000 * nthreads * pages_per_thread, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("Error mmap %s.\n", strerror(errno)); + + wp_init(mem, 0x1000 * nthreads * pages_per_thread); + wp_addr_range(mem, 0x1000 * nthreads * pages_per_thread); + + memset(mem, 0, 0x1000 * nthreads * pages_per_thread); + + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + ksft_test_result(count > 0, "%s count %d\n", __func__, count); + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + ksft_test_result(count == 0, "%s count %d\n", __func__, count); + + finish = 0; + for (i = 0; i < nthreads; ++i) + pthread_create(&th, NULL, thread_proc, mem + 0x1000 * i * pages_per_thread); + + extra_pages = 0; + for (i = 0; i < iter_count; ++i) { + count = 0; + + ret = pthread_barrier_wait(&start_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + count = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, + page_size); + + ret = pthread_barrier_wait(&end_barrier); + if (ret && ret != PTHREAD_BARRIER_SERIAL_THREAD) + ksft_exit_fail_msg("pthread_barrier_wait\n"); + + if (count > nthreads * access_per_thread) + ksft_exit_fail_msg("Too big count %d expected %d, iter %d\n", + count, nthreads * access_per_thread, i); + + c = get_dirty_pages_reset(mem, nthreads * pages_per_thread, 1, page_size); + count += c; + + if (c > nthreads * access_per_thread) { + ksft_test_result_fail(" %s count > nthreads\n", __func__); + return; + } + + if (count != nthreads * access_per_thread) { + /* + * The purpose of the test is to make sure that no page updates are lost + * when the page updates and read-resetting soft dirty flags are performed + * in parallel. However, it is possible that the application will get the + * soft dirty flags twice on the two consecutive read-resets. This seems + * unavoidable as soft dirty flag is handled in software through page faults + * in kernel. While the updating the flags is supposed to be synchronized + * between page fault handling and read-reset, it is possible that + * read-reset happens after page fault PTE update but before the application + * re-executes write instruction. So read-reset gets the flag, clears write + * access and application gets page fault again for the same write. + */ + if (count < nthreads * access_per_thread) { + ksft_test_result_fail("Lost update, iter %d, %d vs %d.\n", i, count, + nthreads * access_per_thread); + return; + } + + extra_pages += count - nthreads * access_per_thread; + } + } + + pthread_barrier_wait(&start_barrier); + finish = 1; + pthread_barrier_wait(&end_barrier); + + ksft_test_result_pass("%s Extra pages %u (%.1lf%%), extra thread faults %d.\n", __func__, + extra_pages, + 100.0 * extra_pages / (iter_count * nthreads * access_per_thread), + extra_thread_faults); +} + +int main(void) +{ + int mem_size, shmid, buf_size, fd, i, ret; + char *mem, *map, *fmem; + struct stat sbuf; + + ksft_print_header(); + ksft_set_plan(115); + + page_size = getpagesize(); + hpage_size = read_pmd_pagesize(); + + pagemap_fd = open(PAGEMAP, O_RDONLY); + if (pagemap_fd < 0) + return -EINVAL; + + if (init_uffd()) + ksft_exit_fail_msg("uffd init failed\n"); + + /* 1. Sanity testing */ + sanity_tests_sd(); + + /* 2. Normal page testing */ + mem_size = 10 * page_size; + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Page testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 3. Large page testing */ + mem_size = 512 * 10 * page_size; + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); + if (mem == MAP_FAILED) + ksft_exit_fail_msg("error nomem\n"); + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Large Page testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + munmap(mem, mem_size); + + /* 4. Huge page testing */ + map = gethugepage(hpage_size); + if (map) { + wp_init(map, hpage_size); + wp_addr_range(map, hpage_size); + base_tests("Huge page testing:", map, hpage_size, 0); + wp_free(map, hpage_size); + free(map); + } else { + base_tests("Huge page testing:", NULL, 0, 1); + } + + /* 5. SHM Hugetlb page testing */ + mem_size = 2*1024*1024; + mem = gethugetlb_mem(mem_size, &shmid); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb shmem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + shmctl(shmid, IPC_RMID, NULL); + } else { + base_tests("Hugetlb shmem testing:", NULL, 0, 1); + } + + /* 6. Hugetlb page testing */ + mem = gethugetlb_mem(mem_size, NULL); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb mem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + } else { + base_tests("Hugetlb mem testing:", NULL, 0, 1); + } + + /* 7. File Hugetlb testing */ + mem_size = 2*1024*1024; + fd = memfd_create("uffd-test", MFD_HUGETLB | MFD_NOEXEC_SEAL); + mem = mmap(NULL, mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (mem) { + wp_init(mem, mem_size); + wp_addr_range(mem, mem_size); + + base_tests("Hugetlb shmem testing:", mem, mem_size, 0); + + wp_free(mem, mem_size); + shmctl(shmid, IPC_RMID, NULL); + } else { + base_tests("Hugetlb shmem testing:", NULL, 0, 1); + } + close(fd); + + /* 8. File memory testing */ + buf_size = page_size * 10; + + fd = open(__FILE__".tmp0", O_RDWR | O_CREAT, 0777); + if (fd < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n", + strerror(errno)); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + ret = stat(__FILE__".tmp0", &sbuf); + if (ret < 0) + ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + + fmem = mmap(NULL, sbuf.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + + wp_init(fmem, sbuf.st_size); + wp_addr_range(fmem, sbuf.st_size); + + base_tests("File memory testing:", fmem, sbuf.st_size, 0); + + wp_free(fmem, sbuf.st_size); + munmap(fmem, sbuf.st_size); + close(fd); + + /* 9. File memory testing */ + buf_size = page_size * 10; + + fd = memfd_create(__FILE__".tmp00", MFD_NOEXEC_SEAL); + if (fd < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file: %s\n", + strerror(errno)); + + if (ftruncate(fd, buf_size)) + ksft_exit_fail_msg("Error ftruncate\n"); + + for (i = 0; i < buf_size; i++) + if (write(fd, "c", 1) < 0) + ksft_exit_fail_msg("Create and read/write to a memory mapped file\n"); + + fmem = mmap(NULL, buf_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (fmem == MAP_FAILED) + ksft_exit_fail_msg("error nomem %ld %s\n", errno, strerror(errno)); + + wp_init(fmem, buf_size); + wp_addr_range(fmem, buf_size); + + base_tests("File anonymous memory testing:", fmem, buf_size, 0); + + wp_free(fmem, buf_size); + munmap(fmem, buf_size); + close(fd); + + /* 10. Huge page tests */ + hpage_unit_tests(); + + /* 11. Iterative test */ + test_simple(); + + /* 12. Mprotect test */ + mprotect_tests(); + + /* 13. Transact test */ + transact_test(page_size); + + /* 14. Sanity testing */ + sanity_tests(); + + /*15. Unmapped address test */ + unmapped_region_tests(); + + /* 16. Userfaultfd tests */ + userfaultfd_tests(); + + close(pagemap_fd); + return ksft_exit_pass(); +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 3e2bc818d566..45941ad5f658 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -56,6 +56,8 @@ separated by spaces: memory protection key tests - soft_dirty test soft dirty page bit semantics +- pagemap + test pagemap_scan IOCTL - cow test copy-on-write semantics - thp @@ -342,6 +344,8 @@ then CATEGORY="soft_dirty" run_test ./soft-dirty fi +CATEGORY="pagemap" run_test ./pagemap_ioctl + # COW tests CATEGORY="cow" run_test ./cow -- cgit v1.2.3 From 7771dcf019dde5998380c40deec0b42f5df9d9a3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 29 Sep 2023 16:13:59 -0400 Subject: radix tree test suite: fix allocation calculation in kmem_cache_alloc_bulk() The bulk allocation is iterating through an array and storing enough memory for the entire bulk allocation instead of a single array entry. Only allocate an array element of the size set in the kmem_cache. Link: https://lkml.kernel.org/r/20230929201359.2857583-1-Liam.Howlett@oracle.com Fixes: cc86e0c2f306 ("radix tree test suite: add support for slab bulk APIs") Signed-off-by: Liam R. Howlett Reported-by: Christophe JAILLET Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- tools/testing/radix-tree/linux.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/radix-tree/linux.c b/tools/testing/radix-tree/linux.c index d587a558997f..61fe2601cb3a 100644 --- a/tools/testing/radix-tree/linux.c +++ b/tools/testing/radix-tree/linux.c @@ -165,9 +165,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *cachep, gfp_t gfp, size_t size, for (i = 0; i < size; i++) { if (cachep->align) { posix_memalign(&p[i], cachep->align, - cachep->size * size); + cachep->size); } else { - p[i] = malloc(cachep->size * size); + p[i] = malloc(cachep->size); } if (cachep->ctor) cachep->ctor(p[i]); -- cgit v1.2.3 From 6facf36ee49675ea952713a7a1488e9f191e7eec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sun, 1 Oct 2023 00:10:29 +0100 Subject: mm/filemap: clarify filemap_fault() comments for not uptodate case The existing comments in filemap_fault() suggest that, after either a minor fault has occurred and filemap_get_folio() found a folio in the page cache, or a major fault arose and __filemap_get_folio(FGP_CREATE...) did the job (having relied on do_sync_mmap_readahead() or filemap_read_folio() to read in the folio), the only possible reason it could not be uptodate is because of an error. This is not so, as if, for instance, the fault occurred within a VMA which had the VM_RAND_READ flag set (via madvise() with the MADV_RANDOM flag specified), this would cause even synchronous readahead to fail to read in the folio. I confirmed this by dropping page caches and faulting in memory madvise()'d this way, observing that this code path was reached on each occasion. Clarify the comments to include this case, and additionally update the comment recently added around the invalidate lock logic to make it clear the comment explicitly refers to the minor fault case. In addition, while we're here, refer to folios rather than pages. [lstoakes@gmail.com: correct identation as per Christopher's feedback] Link: https://lkml.kernel.org/r/2c7014c0-6343-4e76-8697-3f84f54350bd@lucifer.local Link: https://lkml.kernel.org/r/20230930231029.88196-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 340c693112a9..062c25b184a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3307,21 +3307,28 @@ retry_find: VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); /* - * We have a locked page in the page cache, now we need to check - * that it's up-to-date. If not, it is going to be due to an error. + * We have a locked folio in the page cache, now we need to check + * that it's up-to-date. If not, it is going to be due to an error, + * or because readahead was otherwise unable to retrieve it. */ if (unlikely(!folio_test_uptodate(folio))) { /* - * The page was in cache and uptodate and now it is not. - * Strange but possible since we didn't hold the page lock all - * the time. Let's drop everything get the invalidate lock and - * try again. + * If the invalidate lock is not held, the folio was in cache + * and uptodate and now it is not. Strange but possible since we + * didn't hold the page lock all the time. Let's drop + * everything, get the invalidate lock and try again. */ if (!mapping_locked) { folio_unlock(folio); folio_put(folio); goto retry_find; } + + /* + * OK, the folio is really not uptodate. This can be because the + * VMA has the VM_RAND_READ flag set, or because an error + * arose. Let's read it in directly. + */ goto page_not_uptodate; } -- cgit v1.2.3 From ee615d4585cfc305bf6c218a62123c3051f8b4a3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:25:38 -0700 Subject: shmem: shrink shmem_inode_info: dir_offsets in a union Patch series "shmem,tmpfs: general maintenance". Mostly just cosmetic mods in mm/shmem.c, but the last two enforcing the "size=" limit better. 8/8 goes into percpu counter territory, and could stand alone. This patch (of 8): Shave 32 bytes off (the 64-bit) shmem_inode_info. There was a 4-byte pahole after stop_eviction, better filled by fsflags. And the 24-byte dir_offsets can only be used by directories, whereas shrinklist and swaplist only by shmem_mapping() inodes (regular files or long symlinks): so put those into a union. No change in mm/shmem.c is required for this. Link: https://lkml.kernel.org/r/c7441dc6-f3bb-dd60-c670-9f5cbd9f266@google.com Link: https://lkml.kernel.org/r/86ebb4b-c571-b9e8-27f5-cb82ec50357e@google.com Signed-off-by: Hugh Dickins Reviewed-by: Chuck Lever Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Darrick J. Wong Cc: Dave Chinner Cc: Tim Chen Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 6b0c626620f5..2caa6b86106a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -23,18 +23,22 @@ struct shmem_inode_info { unsigned long flags; unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ - pgoff_t fallocend; /* highest fallocate endindex */ - struct list_head shrinklist; /* shrinkable hpage inodes */ - struct list_head swaplist; /* chain of maybes on swap */ + union { + struct offset_ctx dir_offsets; /* stable directory offsets */ + struct { + struct list_head shrinklist; /* shrinkable hpage inodes */ + struct list_head swaplist; /* chain of maybes on swap */ + }; + }; + struct timespec64 i_crtime; /* file creation time */ struct shared_policy policy; /* NUMA memory alloc policy */ struct simple_xattrs xattrs; /* list of xattrs */ + pgoff_t fallocend; /* highest fallocate endindex */ + unsigned int fsflags; /* for FS_IOC_[SG]ETFLAGS */ atomic_t stop_eviction; /* hold when working on inode */ - struct timespec64 i_crtime; /* file creation time */ - unsigned int fsflags; /* flags for FS_IOC_[SG]ETFLAGS */ #ifdef CONFIG_TMPFS_QUOTA struct dquot *i_dquot[MAXQUOTAS]; #endif - struct offset_ctx dir_offsets; /* stable entry offsets */ struct inode vfs_inode; }; -- cgit v1.2.3 From e3e1a5067fd2f1b3f4f7c651f5b33082962d1aa1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:26:53 -0700 Subject: shmem: remove vma arg from shmem_get_folio_gfp() The vma is already there in vmf->vma, so no need for a separate arg. Link: https://lkml.kernel.org/r/d9ce6f65-a2ed-48f4-4299-fdb0544875c5@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8539d39ec9a3..dfa7e544a99d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1921,14 +1921,13 @@ unlock: * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vma, vmf, and fault_type are only supplied by shmem_fault: - * otherwise they are NULL. + * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL. */ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) + struct vm_fault *vmf, vm_fault_t *fault_type) { + struct vm_area_struct *vma = vmf ? vmf->vma : NULL; struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; @@ -2141,7 +2140,7 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp) { return shmem_get_folio_gfp(inode, index, foliop, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } /* @@ -2225,7 +2224,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) } err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, - gfp, vma, vmf, &ret); + gfp, vmf, &ret); if (err) return vmf_error(err); if (folio) @@ -4893,7 +4892,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping, BUG_ON(!shmem_mapping(mapping)); error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, - gfp, NULL, NULL, NULL); + gfp, NULL, NULL); if (error) return ERR_PTR(error); -- cgit v1.2.3 From f0a9ad1d4d9ba3c694bca91d8d67be9a4a33b902 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:27:53 -0700 Subject: shmem: factor shmem_falloc_wait() out of shmem_fault() That Trinity livelock shmem_falloc avoidance block is unlikely, and a distraction from the proper business of shmem_fault(): separate it out. (This used to help compilers save stack on the fault path too, but both gcc and clang nowadays seem to make better choices anyway.) Link: https://lkml.kernel.org/r/6fe379a4-6176-9225-9263-fe60d2633c0@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 126 +++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index dfa7e544a99d..0aa8889c76f2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2148,87 +2148,99 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, * entry unconditionally - even if something else had already woken the * target. */ -static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) +static int synchronous_wake_function(wait_queue_entry_t *wait, + unsigned int mode, int sync, void *key) { int ret = default_wake_function(wait, mode, sync, key); list_del_init(&wait->entry); return ret; } +/* + * Trinity finds that probing a hole which tmpfs is punching can + * prevent the hole-punch from ever completing: which in turn + * locks writers out with its hold on i_rwsem. So refrain from + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_rwsem in fault, + * and bloating every shmem inode for this unlikely case would be sad. + */ +static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode) +{ + struct shmem_falloc *shmem_falloc; + struct file *fpin = NULL; + vm_fault_t ret = 0; + + spin_lock(&inode->i_lock); + shmem_falloc = inode->i_private; + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); + + ret = VM_FAULT_NOPAGE; + fpin = maybe_unlock_mmap_for_io(vmf, NULL); + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + } + spin_unlock(&inode->i_lock); + if (fpin) { + fput(fpin); + ret = VM_FAULT_RETRY; + } + return ret; +} + static vm_fault_t shmem_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = file_inode(vma->vm_file); + struct inode *inode = file_inode(vmf->vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); struct folio *folio = NULL; + vm_fault_t ret = 0; int err; - vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can - * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_rwsem. So refrain from - * faulting pages into the hole while it's being punched. Although - * shmem_undo_range() does remove the additions, it may be unable to - * keep up, as each new page needs its own unmap_mapping_range() call, - * and the i_mmap tree grows ever slower to scan if new vmas are added. - * - * It does not matter if we sometimes reach this check just before the - * hole-punch begins, so that one fault then races with the punch: - * we just need to make racing faults a rare case. - * - * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_rwsem in fault, - * and bloating every shmem inode for this unlikely case would be sad. + * prevent the hole-punch from ever completing: noted in i_private. */ if (unlikely(inode->i_private)) { - struct shmem_falloc *shmem_falloc; - - spin_lock(&inode->i_lock); - shmem_falloc = inode->i_private; - if (shmem_falloc && - shmem_falloc->waitq && - vmf->pgoff >= shmem_falloc->start && - vmf->pgoff < shmem_falloc->next) { - struct file *fpin; - wait_queue_head_t *shmem_falloc_waitq; - DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function); - - ret = VM_FAULT_NOPAGE; - fpin = maybe_unlock_mmap_for_io(vmf, NULL); - if (fpin) - ret = VM_FAULT_RETRY; - - shmem_falloc_waitq = shmem_falloc->waitq; - prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, - TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - schedule(); - - /* - * shmem_falloc_waitq points into the shmem_fallocate() - * stack of the hole-punching task: shmem_falloc_waitq - * is usually invalid by the time we reach here, but - * finish_wait() does not dereference it in that case; - * though i_lock needed lest racing with wake_up_all(). - */ - spin_lock(&inode->i_lock); - finish_wait(shmem_falloc_waitq, &shmem_fault_wait); - spin_unlock(&inode->i_lock); - - if (fpin) - fput(fpin); + ret = shmem_falloc_wait(vmf, inode); + if (ret) return ret; - } - spin_unlock(&inode->i_lock); } + WARN_ON_ONCE(vmf->page != NULL); err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vmf, &ret); if (err) return vmf_error(err); - if (folio) + if (folio) { vmf->page = folio_file_page(folio, vmf->pgoff); + ret |= VM_FAULT_LOCKED; + } return ret; } -- cgit v1.2.3 From 9be7d5b06648b808989e99c5d0bea1be47c5a384 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:28:50 -0700 Subject: shmem: trivial tidyups, removing extra blank lines, etc Mostly removing a few superfluous blank lines, joining short arglines, imposing some 80-column observance, correcting a couple of comments. None of it more interesting than deleting a repeated INIT_LIST_HEAD(). Link: https://lkml.kernel.org/r/b3983d28-5d3f-8649-36af-b819285d7a9e@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 56 +++++++++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0aa8889c76f2..3540fc598848 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -756,7 +756,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Like filemap_add_folio, but error if expected item has gone. + * Somewhat like filemap_add_folio, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, @@ -825,7 +825,7 @@ error: } /* - * Like delete_from_page_cache, but substitutes swap for @folio. + * Somewhat like filemap_remove_folio, but substitutes swap for @folio. */ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { @@ -887,7 +887,6 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, cond_resched_rcu(); } } - rcu_read_unlock(); return swapped << PAGE_SHIFT; @@ -1213,7 +1212,6 @@ static int shmem_setattr(struct mnt_idmap *idmap, if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { error = dquot_transfer(idmap, inode, attr); - if (error) return error; } @@ -2456,7 +2454,6 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, if (err) return ERR_PTR(err); - inode = new_inode(sb); if (!inode) { shmem_free_inode(sb, 0); @@ -2481,11 +2478,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); - INIT_LIST_HEAD(&info->swaplist); - if (sbinfo->noswap) - mapping_set_unevictable(inode->i_mapping); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); + if (sbinfo->noswap) + mapping_set_unevictable(inode->i_mapping); mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { @@ -2697,7 +2693,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, } ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); - if (ret) return ret; @@ -3229,8 +3224,7 @@ shmem_mknod(struct mnt_idmap *idmap, struct inode *dir, error = simple_acl_create(dir, inode); if (error) goto out_iput; - error = security_inode_init_security(inode, dir, - &dentry->d_name, + error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3259,14 +3253,11 @@ shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir, int error; inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE); - if (IS_ERR(inode)) { error = PTR_ERR(inode); goto err_out; } - - error = security_inode_init_security(inode, dir, - NULL, + error = security_inode_init_security(inode, dir, NULL, shmem_initxattrs, NULL); if (error && error != -EOPNOTSUPP) goto out_iput; @@ -3303,7 +3294,8 @@ static int shmem_create(struct mnt_idmap *idmap, struct inode *dir, /* * Link a file.. */ -static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +static int shmem_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); int ret = 0; @@ -3334,7 +3326,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ - dget(dentry); /* Extra pinning count for the created dentry */ + dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: return ret; @@ -3354,7 +3346,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) inode_set_ctime_current(inode)); inode_inc_iversion(dir); drop_nlink(inode); - dput(dentry); /* Undo the count from "create" - this does all the work */ + dput(dentry); /* Undo the count from "create" - does all the work */ return 0; } @@ -3464,7 +3456,6 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir, inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0, VM_NORESERVE); - if (IS_ERR(inode)) return PTR_ERR(inode); @@ -3518,8 +3509,7 @@ static void shmem_put_link(void *arg) folio_put(arg); } -static const char *shmem_get_link(struct dentry *dentry, - struct inode *inode, +static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct folio *folio = NULL; @@ -3593,8 +3583,7 @@ static int shmem_fileattr_set(struct mnt_idmap *idmap, * Callback for security_inode_init_security() for acquiring xattrs. */ static int shmem_initxattrs(struct inode *inode, - const struct xattr *xattr_array, - void *fs_info) + const struct xattr *xattr_array, void *fs_info) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -3778,7 +3767,6 @@ static struct dentry *shmem_find_alias(struct inode *inode) return alias ?: d_find_any_alias(inode); } - static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -4362,8 +4350,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) } #endif /* CONFIG_TMPFS_QUOTA */ - inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, S_IFDIR | sbinfo->mode, 0, - VM_NORESERVE); + inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL, + S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto failed; @@ -4662,11 +4650,9 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, for (i = 0; i < ARRAY_SIZE(values); i++) { len += sysfs_emit_at(buf, len, - shmem_huge == values[i] ? "%s[%s]" : "%s%s", - i ? " " : "", - shmem_format_huge(values[i])); + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? " " : "", shmem_format_huge(values[i])); } - len += sysfs_emit_at(buf, len, "\n"); return len; @@ -4763,8 +4749,9 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block *sb, struct inode *dir, - umode_t mode, dev_t dev, unsigned long flags) +static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, + struct super_block *sb, struct inode *dir, + umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode = ramfs_get_inode(sb, dir, mode, dev); return inode ? inode : ERR_PTR(-ENOSPC); @@ -4774,8 +4761,8 @@ static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct supe /* common code */ -static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size, - unsigned long flags, unsigned int i_flags) +static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, + loff_t size, unsigned long flags, unsigned int i_flags) { struct inode *inode; struct file *res; @@ -4794,7 +4781,6 @@ static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, l inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); - if (IS_ERR(inode)) { shmem_unacct_size(flags, size); return ERR_CAST(inode); -- cgit v1.2.3 From 4199f51a7eb2054d68964efbd8d39c68053a8714 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:30:03 -0700 Subject: shmem: shmem_acct_blocks() and shmem_inode_acct_blocks() By historical accident, shmem_acct_block() and shmem_inode_acct_block() were never pluralized when the pages argument was added, despite their complements being shmem_unacct_blocks() and shmem_inode_unacct_blocks() all along. It has been an irritation: fix their naming at last. Link: https://lkml.kernel.org/r/9124094-e4ab-8be7-ef80-9a87bdc2e4fc@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3540fc598848..fabce18ed9e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -189,10 +189,10 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ -static inline int shmem_acct_block(unsigned long flags, long pages) +static inline int shmem_acct_blocks(unsigned long flags, long pages) { if (!(flags & VM_NORESERVE)) return 0; @@ -207,13 +207,13 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE)); } -static int shmem_inode_acct_block(struct inode *inode, long pages) +static int shmem_inode_acct_blocks(struct inode *inode, long pages) { struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int err = -ENOSPC; - if (shmem_acct_block(info->flags, pages)) + if (shmem_acct_blocks(info->flags, pages)) return err; might_sleep(); /* when quotas */ @@ -447,7 +447,7 @@ bool shmem_charge(struct inode *inode, long pages) { struct address_space *mapping = inode->i_mapping; - if (shmem_inode_acct_block(inode, pages)) + if (shmem_inode_acct_blocks(inode, pages)) return false; /* nrpages adjustment first, then shmem_recalc_inode() when balanced */ @@ -1671,7 +1671,7 @@ static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, huge = false; nr = huge ? HPAGE_PMD_NR : 1; - err = shmem_inode_acct_block(inode, nr); + err = shmem_inode_acct_blocks(inode, nr); if (err) goto failed; @@ -2572,7 +2572,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, int ret; pgoff_t max_off; - if (shmem_inode_acct_block(inode, 1)) { + if (shmem_inode_acct_blocks(inode, 1)) { /* * We may have got a page, returned -ENOENT triggering a retry, * and now we find ourselves with -ENOMEM. Release the page, to -- cgit v1.2.3 From 054a9f7ccd0a60607fb9bbe1e06ca671494971bf Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:31:27 -0700 Subject: shmem: move memcg charge out of shmem_add_to_page_cache() Extract shmem's memcg charging out of shmem_add_to_page_cache(): it's misleading done there, because many calls are dealing with a swapcache page, whose memcg is nowadays always remembered while swapped out, then the charge re-levied when it's brought back into swapcache. Temporarily move it back up to the shmem_get_folio_gfp() level, where the memcg was charged before v5.8; but the next commit goes on to move it back down to a new home. In making this change, it becomes clear that shmem_swapin_folio() does not need to know the vma, just the fault mm (if any): call it fault_mm rather than charge_mm - let mem_cgroup_charge() decide whom to charge. Link: https://lkml.kernel.org/r/4b2143c5-bf32-64f0-841-81a81158dac@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 68 +++++++++++++++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index fabce18ed9e5..670f77de3238 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -146,9 +146,8 @@ static unsigned long shmem_default_max_inodes(void) #endif static int shmem_swapin_folio(struct inode *inode, pgoff_t index, - struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - vm_fault_t *fault_type); + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, vm_fault_t *fault_type); static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -760,12 +759,10 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, */ static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp, - struct mm_struct *charge_mm) + pgoff_t index, void *expected, gfp_t gfp) { XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); long nr = folio_nr_pages(folio); - int error; VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -776,16 +773,7 @@ static int shmem_add_to_page_cache(struct folio *folio, folio->mapping = mapping; folio->index = index; - if (!folio_test_swapcache(folio)) { - error = mem_cgroup_charge(folio, charge_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto error; - } - } + gfp &= GFP_RECLAIM_MASK; folio_throttle_swaprate(folio, gfp); do { @@ -813,15 +801,12 @@ unlock: } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - error = xas_error(&xas); - goto error; + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return xas_error(&xas); } return 0; -error: - folio->mapping = NULL; - folio_ref_sub(folio, nr); - return error; } /* @@ -1324,10 +1309,8 @@ static int shmem_unuse_swap_entries(struct inode *inode, if (!xa_is_value(folio)) continue; - error = shmem_swapin_folio(inode, indices[i], - &folio, SGP_CACHE, - mapping_gfp_mask(mapping), - NULL, NULL); + error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE, + mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { folio_unlock(folio); folio_put(folio); @@ -1810,12 +1793,11 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, */ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, + gfp_t gfp, struct mm_struct *fault_mm, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; struct swap_info_struct *si; struct folio *folio = NULL; swp_entry_t swap; @@ -1843,7 +1825,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, if (fault_type) { *fault_type |= VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); - count_memcg_event_mm(charge_mm, PGMAJFAULT); + count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ folio = shmem_swapin(swap, gfp, info, index); @@ -1880,8 +1862,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, } error = shmem_add_to_page_cache(folio, mapping, index, - swp_to_radix_entry(swap), gfp, - charge_mm); + swp_to_radix_entry(swap), gfp); if (error) goto failed; @@ -1929,7 +1910,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct mm_struct *charge_mm; + struct mm_struct *fault_mm; struct folio *folio; pgoff_t hindex; gfp_t huge_gfp; @@ -1946,7 +1927,7 @@ repeat: } sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : NULL; + fault_mm = vma ? vma->vm_mm : NULL; folio = filemap_get_entry(mapping, index); if (folio && vma && userfaultfd_minor(vma)) { @@ -1958,7 +1939,7 @@ repeat: if (xa_is_value(folio)) { error = shmem_swapin_folio(inode, index, &folio, - sgp, gfp, vma, fault_type); + sgp, gfp, fault_mm, fault_type); if (error == -EEXIST) goto repeat; @@ -2044,9 +2025,16 @@ alloc_nohuge: if (sgp == SGP_WRITE) __folio_set_referenced(folio); - error = shmem_add_to_page_cache(folio, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK, - charge_mm); + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unacct; + } + + error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); if (error) goto unacct; @@ -2644,8 +2632,10 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd, if (unlikely(pgoff >= max_off)) goto out_release; - ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm); + ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp); + if (ret) + goto out_release; + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp); if (ret) goto out_release; -- cgit v1.2.3 From 3022fd7af9604d44ec43da8a4398872989599b18 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:32:40 -0700 Subject: shmem: _add_to_page_cache() before shmem_inode_acct_blocks() There has been a recurring problem, that when a tmpfs volume is being filled by racing threads, some fail with ENOSPC (or consequent SIGBUS or EFAULT) even though all allocations were within the permitted size. This was a problem since early days, but magnified and complicated by the addition of huge pages. We have often worked around it by adding some slop to the tmpfs size, but it's hard to say how much is needed, and some users prefer not to do that e.g. keeping sparse files in a tightly tailored tmpfs helps to prevent accidental writing to holes. This comes from the allocation sequence: 1. check page cache for existing folio 2. check and reserve from vm_enough_memory 3. check and account from size of tmpfs 4. if huge, check page cache for overlapping folio 5. allocate physical folio, huge or small 6. check and charge from mem cgroup limit 7. add to page cache (but maybe another folio already got in). Concurrent tasks allocating at the same position could deplete the size allowance and fail. Doing vm_enough_memory and size checks before the folio allocation was intentional (to limit the load on the page allocator from this source) and still has some virtue; but memory cgroup never did that, so I think it's better reordered to favour predictable behaviour. 1. check page cache for existing folio 2. if huge, check page cache for overlapping folio 3. allocate physical folio, huge or small 4. check and charge from mem cgroup limit 5. add to page cache (but maybe another folio already got in) 6. check and reserve from vm_enough_memory 7. check and account from size of tmpfs. The folio lock held from allocation onwards ensures that the !uptodate folio cannot be used by others, and can safely be deleted from the cache if checks 6 or 7 subsequently fail (and those waiting on folio lock already check that the folio was not truncated once they get the lock); and the early addition to page cache ensures that racers find it before they try to duplicate the accounting. Seize the opportunity to tidy up shmem_get_folio_gfp()'s ENOSPC retrying, which can be combined inside the new shmem_alloc_and_add_folio(): doing 2 splits twice (once huge, once nonhuge) is not exactly equivalent to trying 5 splits (and giving up early on huge), but let's keep it simple unless more complication proves necessary. Userfaultfd is a foreign country: they do things differently there, and for good reason - to avoid mmap_lock deadlock. Leave ordering in shmem_mfill_atomic_pte() untouched for now, but I would rather like to mesh it better with shmem_get_folio_gfp() in the future. Link: https://lkml.kernel.org/r/22ddd06-d919-33b-1219-56335c1bf28e@google.com Signed-off-by: Hugh Dickins Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- mm/shmem.c | 229 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 118 insertions(+), 111 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 670f77de3238..269cd3c1110f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -789,13 +789,11 @@ static int shmem_add_to_page_cache(struct folio *folio, xas_store(&xas, folio); if (xas_error(&xas)) goto unlock; - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_ALLOC); + if (folio_test_pmd_mappable(folio)) __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); - } - mapping->nrpages += nr; __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); + mapping->nrpages += nr; unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -1612,25 +1610,17 @@ static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct address_space *mapping = info->vfs_inode.i_mapping; - pgoff_t hindex; struct folio *folio; - hindex = round_down(index, HPAGE_PMD_NR); - if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, - XA_PRESENT)) - return NULL; - - shmem_pseudo_vma_init(&pvma, info, hindex); + shmem_pseudo_vma_init(&pvma, info, index); folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); - if (!folio) - count_vm_event(THP_FILE_FALLBACK); + return folio; } static struct folio *shmem_alloc_folio(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct folio *folio; @@ -1642,36 +1632,101 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) +static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, + struct inode *inode, pgoff_t index, + struct mm_struct *fault_mm, bool huge) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct folio *folio; - int nr; - int err; + long pages; + int error; if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; - nr = huge ? HPAGE_PMD_NR : 1; - err = shmem_inode_acct_blocks(inode, nr); - if (err) - goto failed; + if (huge) { + pages = HPAGE_PMD_NR; + index = round_down(index, HPAGE_PMD_NR); + + /* + * Check for conflict before waiting on a huge allocation. + * Conflict might be that a huge page has just been allocated + * and added to page cache by a racing thread, or that there + * is already at least one small page in the huge extent. + * Be careful to retry when appropriate, but not forever! + * Elsewhere -EEXIST would be the right code, but not here. + */ + if (xa_find(&mapping->i_pages, &index, + index + HPAGE_PMD_NR - 1, XA_PRESENT)) + return ERR_PTR(-E2BIG); - if (huge) folio = shmem_alloc_hugefolio(gfp, info, index); - else + if (!folio) + count_vm_event(THP_FILE_FALLBACK); + } else { + pages = 1; folio = shmem_alloc_folio(gfp, info, index); - if (folio) { - __folio_set_locked(folio); - __folio_set_swapbacked(folio); - return folio; + } + if (!folio) + return ERR_PTR(-ENOMEM); + + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + + gfp &= GFP_RECLAIM_MASK; + error = mem_cgroup_charge(folio, fault_mm, gfp); + if (error) { + if (xa_find(&mapping->i_pages, &index, + index + pages - 1, XA_PRESENT)) { + error = -EEXIST; + } else if (huge) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto unlock; } - err = -ENOMEM; - shmem_inode_unacct_blocks(inode, nr); -failed: - return ERR_PTR(err); + error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp); + if (error) + goto unlock; + + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + long freed; + /* + * Try to reclaim some space by splitting a few + * large folios beyond i_size on the filesystem. + */ + shmem_unused_huge_shrink(sbinfo, NULL, 2); + /* + * And do a shmem_recalc_inode() to account for freed pages: + * except our folio is there in cache, so not quite balanced. + */ + spin_lock(&info->lock); + freed = pages + info->alloced - info->swapped - + READ_ONCE(mapping->nrpages); + if (freed > 0) + info->alloced -= freed; + spin_unlock(&info->lock); + if (freed > 0) + shmem_inode_unacct_blocks(inode, freed); + error = shmem_inode_acct_blocks(inode, pages); + if (error) { + filemap_remove_folio(folio); + goto unlock; + } + } + + shmem_recalc_inode(inode, pages, 0); + folio_add_lru(folio); + return folio; + +unlock: + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(error); } /* @@ -1907,29 +1962,22 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct vm_fault *vmf, vm_fault_t *fault_type) { struct vm_area_struct *vma = vmf ? vmf->vma : NULL; - struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo; struct mm_struct *fault_mm; struct folio *folio; - pgoff_t hindex; - gfp_t huge_gfp; int error; - int once = 0; - int alloced = 0; + bool alloced; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; repeat: if (sgp <= SGP_CACHE && - ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { + ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) return -EINVAL; - } - sbinfo = SHMEM_SB(inode->i_sb); + alloced = false; fault_mm = vma ? vma->vm_mm : NULL; - folio = filemap_get_entry(mapping, index); + folio = filemap_get_entry(inode->i_mapping, index); if (folio && vma && userfaultfd_minor(vma)) { if (!xa_is_value(folio)) folio_put(folio); @@ -1951,7 +1999,7 @@ repeat: folio_lock(folio); /* Has the folio been truncated or swapped out? */ - if (unlikely(folio->mapping != mapping)) { + if (unlikely(folio->mapping != inode->i_mapping)) { folio_unlock(folio); folio_put(folio); goto repeat; @@ -1986,65 +2034,38 @@ repeat: return 0; } - if (!shmem_is_huge(inode, index, false, - vma ? vma->vm_mm : NULL, vma ? vma->vm_flags : 0)) - goto alloc_nohuge; + if (shmem_is_huge(inode, index, false, fault_mm, + vma ? vma->vm_flags : 0)) { + gfp_t huge_gfp; - huge_gfp = vma_thp_gfp_mask(vma); - huge_gfp = limit_gfp_mask(huge_gfp, gfp); - folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); - if (IS_ERR(folio)) { -alloc_nohuge: - folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_add_folio(huge_gfp, + inode, index, fault_mm, true); + if (!IS_ERR(folio)) { + count_vm_event(THP_FILE_ALLOC); + goto alloced; + } + if (PTR_ERR(folio) == -EEXIST) + goto repeat; } - if (IS_ERR(folio)) { - int retry = 5; + folio = shmem_alloc_and_add_folio(gfp, inode, index, fault_mm, false); + if (IS_ERR(folio)) { error = PTR_ERR(folio); + if (error == -EEXIST) + goto repeat; folio = NULL; - if (error != -ENOSPC) - goto unlock; - /* - * Try to reclaim some space by splitting a large folio - * beyond i_size on the filesystem. - */ - while (retry--) { - int ret; - - ret = shmem_unused_huge_shrink(sbinfo, NULL, 1); - if (ret == SHRINK_STOP) - break; - if (ret) - goto alloc_nohuge; - } goto unlock; } - hindex = round_down(index, folio_nr_pages(folio)); - - if (sgp == SGP_WRITE) - __folio_set_referenced(folio); - - error = mem_cgroup_charge(folio, fault_mm, gfp); - if (error) { - if (folio_test_pmd_mappable(folio)) { - count_vm_event(THP_FILE_FALLBACK); - count_vm_event(THP_FILE_FALLBACK_CHARGE); - } - goto unacct; - } - - error = shmem_add_to_page_cache(folio, mapping, hindex, NULL, gfp); - if (error) - goto unacct; - - folio_add_lru(folio); - shmem_recalc_inode(inode, folio_nr_pages(folio), 0); +alloced: alloced = true; - if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < folio_next_index(folio) - 1) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + struct shmem_inode_info *info = SHMEM_I(inode); /* * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. @@ -2062,6 +2083,8 @@ alloc_nohuge: spin_unlock(&sbinfo->shrinklist_lock); } + if (sgp == SGP_WRITE) + folio_set_referenced(folio); /* * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ @@ -2085,11 +2108,6 @@ clear: /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { - if (alloced) { - folio_clear_dirty(folio); - filemap_remove_folio(folio); - shmem_recalc_inode(inode, 0, 0); - } error = -EINVAL; goto unlock; } @@ -2100,25 +2118,14 @@ out: /* * Error recovery. */ -unacct: - shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - - if (folio_test_large(folio)) { - folio_unlock(folio); - folio_put(folio); - goto alloc_nohuge; - } unlock: + if (alloced) + filemap_remove_folio(folio); + shmem_recalc_inode(inode, 0, 0); if (folio) { folio_unlock(folio); folio_put(folio); } - if (error == -ENOSPC && !once++) { - shmem_recalc_inode(inode, 0, 0); - goto repeat; - } - if (error == -EEXIST) - goto repeat; return error; } -- cgit v1.2.3 From beb9868628445306958fd7b2da1cd369a4a381cc Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Sep 2023 20:42:45 -0700 Subject: shmem,percpu_counter: add _limited_add(fbc, limit, amount) Percpu counter's compare and add are separate functions: without locking around them (which would defeat their purpose), it has been possible to overflow the intended limit. Imagine all the other CPUs fallocating tmpfs huge pages to the limit, in between this CPU's compare and its add. I have not seen reports of that happening; but tmpfs's recent addition of dquot_alloc_block_nodirty() in between the compare and the add makes it even more likely, and I'd be uncomfortable to leave it unfixed. Introduce percpu_counter_limited_add(fbc, limit, amount) to prevent it. I believe this implementation is correct, and slightly more efficient than the combination of compare and add (taking the lock once rather than twice when nearing full - the last 128MiB of a tmpfs volume on a machine with 128 CPUs and 4KiB pages); but it does beg for a better design - when nearing full, there is no new batching, but the costly percpu counter sum across CPUs still has to be done, while locked. Follow __percpu_counter_sum()'s example, including cpu_dying_mask as well as cpu_online_mask: but shouldn't __percpu_counter_compare() and __percpu_counter_limited_add() then be adding a num_dying_cpus() to num_online_cpus(), when they calculate the maximum which could be held across CPUs? But the times when it matters would be vanishingly rare. Link: https://lkml.kernel.org/r/bb817848-2d19-bcc8-39ca-ea179af0f0b4@google.com Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Tim Chen Cc: Dave Chinner Cc: Darrick J. Wong Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 23 ++++++++++++++++++ lib/percpu_counter.c | 53 ++++++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 10 ++++---- 3 files changed, 81 insertions(+), 5 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index d01351b1526f..8cb7c071bd5c 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -57,6 +57,8 @@ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch); +bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, + s64 amount, s32 batch); void percpu_counter_sync(struct percpu_counter *fbc); static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) @@ -69,6 +71,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ + return __percpu_counter_limited_add(fbc, limit, amount, + percpu_counter_batch); +} + /* * With percpu_counter_add_local() and percpu_counter_sub_local(), counts * are accumulated in local per cpu counter and not in fbc->count until @@ -185,6 +194,20 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount) local_irq_restore(flags); } +static inline bool +percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) +{ + unsigned long flags; + s64 count; + + local_irq_save(flags); + count = fbc->count + amount; + if (count <= limit) + fbc->count = count; + local_irq_restore(flags); + return count <= limit; +} + /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ static inline void percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 9073430dc865..58a3392f471b 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -278,6 +278,59 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) } EXPORT_SYMBOL(__percpu_counter_compare); +/* + * Compare counter, and add amount if the total is within limit. + * Return true if amount was added, false if it would exceed limit. + */ +bool __percpu_counter_limited_add(struct percpu_counter *fbc, + s64 limit, s64 amount, s32 batch) +{ + s64 count; + s64 unknown; + unsigned long flags; + bool good; + + if (amount > limit) + return false; + + local_irq_save(flags); + unknown = batch * num_online_cpus(); + count = __this_cpu_read(*fbc->counters); + + /* Skip taking the lock when safe */ + if (abs(count + amount) <= batch && + fbc->count + unknown <= limit) { + this_cpu_add(*fbc->counters, amount); + local_irq_restore(flags); + return true; + } + + raw_spin_lock(&fbc->lock); + count = fbc->count + amount; + + /* Skip percpu_counter_sum() when safe */ + if (count + unknown > limit) { + s32 *pcount; + int cpu; + + for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) { + pcount = per_cpu_ptr(fbc->counters, cpu); + count += *pcount; + } + } + + good = count <= limit; + if (good) { + count = __this_cpu_read(*fbc->counters); + fbc->count += count + amount; + __this_cpu_sub(*fbc->counters, count); + } + + raw_spin_unlock(&fbc->lock); + local_irq_restore(flags); + return good; +} + static int __init percpu_counter_startup(void) { int ret; diff --git a/mm/shmem.c b/mm/shmem.c index 269cd3c1110f..61b170324e5c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -217,15 +217,15 @@ static int shmem_inode_acct_blocks(struct inode *inode, long pages) might_sleep(); /* when quotas */ if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - pages) > 0) + if (!percpu_counter_limited_add(&sbinfo->used_blocks, + sbinfo->max_blocks, pages)) goto unacct; err = dquot_alloc_block_nodirty(inode, pages); - if (err) + if (err) { + percpu_counter_sub(&sbinfo->used_blocks, pages); goto unacct; - - percpu_counter_add(&sbinfo->used_blocks, pages); + } } else { err = dquot_alloc_block_nodirty(inode, pages); if (err) -- cgit v1.2.3 From 1431996bf9088ee59f8017637ab9a7f89909ae63 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Oct 2023 21:40:09 -0700 Subject: percpu_counter: extend _limited_add() to negative amounts Though tmpfs does not need it, percpu_counter_limited_add() can be twice as useful if it works sensibly with negative amounts (subs) - typically decrements towards a limit of 0 or nearby: as suggested by Dave Chinner. And in the course of that reworking, skip the percpu counter sum if it is already obvious that the limit would be passed: as suggested by Tim Chen. Extend the comment above __percpu_counter_limited_add(), defining the behaviour with positive and negative amounts, allowing negative limits, but not bothering about overflow beyond S64_MAX. Link: https://lkml.kernel.org/r/8f86083b-c452-95d4-365b-f16a2e4ebcd4@google.com Signed-off-by: Hugh Dickins Cc: Axel Rasmussen Cc: Carlos Maiolino Cc: Christian Brauner Cc: Chuck Lever Cc: Darrick J. Wong Cc: Dave Chinner Cc: Jan Kara Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Tim Chen Signed-off-by: Andrew Morton --- include/linux/percpu_counter.h | 11 +++++++-- lib/percpu_counter.c | 54 +++++++++++++++++++++++++++++++----------- 2 files changed, 49 insertions(+), 16 deletions(-) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 8cb7c071bd5c..3a44dd1e33d2 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -198,14 +198,21 @@ static inline bool percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount) { unsigned long flags; + bool good = false; s64 count; + if (amount == 0) + return true; + local_irq_save(flags); count = fbc->count + amount; - if (count <= limit) + if ((amount > 0 && count <= limit) || + (amount < 0 && count >= limit)) { fbc->count = count; + good = true; + } local_irq_restore(flags); - return count <= limit; + return good; } /* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 58a3392f471b..44dd133594d4 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -279,8 +279,16 @@ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) EXPORT_SYMBOL(__percpu_counter_compare); /* - * Compare counter, and add amount if the total is within limit. - * Return true if amount was added, false if it would exceed limit. + * Compare counter, and add amount if total is: less than or equal to limit if + * amount is positive, or greater than or equal to limit if amount is negative. + * Return true if amount is added, or false if total would be beyond the limit. + * + * Negative limit is allowed, but unusual. + * When negative amounts (subs) are given to percpu_counter_limited_add(), + * the limit would most naturally be 0 - but other limits are also allowed. + * + * Overflow beyond S64_MAX is not allowed for: counter, limit and amount + * are all assumed to be sane (far from S64_MIN and S64_MAX). */ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount, s32 batch) @@ -288,10 +296,10 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 count; s64 unknown; unsigned long flags; - bool good; + bool good = false; - if (amount > limit) - return false; + if (amount == 0) + return true; local_irq_save(flags); unknown = batch * num_online_cpus(); @@ -299,7 +307,8 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, /* Skip taking the lock when safe */ if (abs(count + amount) <= batch && - fbc->count + unknown <= limit) { + ((amount > 0 && fbc->count + unknown <= limit) || + (amount < 0 && fbc->count - unknown >= limit))) { this_cpu_add(*fbc->counters, amount); local_irq_restore(flags); return true; @@ -309,7 +318,19 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, count = fbc->count + amount; /* Skip percpu_counter_sum() when safe */ - if (count + unknown > limit) { + if (amount > 0) { + if (count - unknown > limit) + goto out; + if (count + unknown <= limit) + good = true; + } else { + if (count + unknown < limit) + goto out; + if (count - unknown >= limit) + good = true; + } + + if (!good) { s32 *pcount; int cpu; @@ -317,15 +338,20 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc, pcount = per_cpu_ptr(fbc->counters, cpu); count += *pcount; } + if (amount > 0) { + if (count > limit) + goto out; + } else { + if (count < limit) + goto out; + } + good = true; } - good = count <= limit; - if (good) { - count = __this_cpu_read(*fbc->counters); - fbc->count += count + amount; - __this_cpu_sub(*fbc->counters, count); - } - + count = __this_cpu_read(*fbc->counters); + fbc->count += count + amount; + __this_cpu_sub(*fbc->counters, count); +out: raw_spin_unlock(&fbc->lock); local_irq_restore(flags); return good; -- cgit v1.2.3 From 5d74b2ab2c15d596c470bae6626f345d5575a9d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:13 +0100 Subject: mm: make lock_folio_maybe_drop_mmap() VMA lock aware Patch series "Handle more faults under the VMA lock", v2. At this point, we're handling the majority of file-backed page faults under the VMA lock, using the ->map_pages entry point. This patch set attempts to expand that for the following siutations: - We have to do a read. This could be because we've hit the point in the readahead window where we need to kick off the next readahead, or because the page is simply not present in cache. - We're handling a write fault. Most applications don't do I/O by writes to shared mmaps for very good reasons, but some do, and it'd be nice to not make that slow unnecessarily. - We're doing a COW of a private mapping (both PTE already present and PTE not-present). These are two different codepaths and I handle both of them in this patch set. There is no support in this patch set for drivers to mark themselves as being VMA lock friendly; they could implement the ->map_pages vm_operation, but if they do, they would be the first. This is probably something we want to change at some point in the future, and I've marked where to make that change in the code. There is very little performance change in the benchmarks we've run; mostly because the vast majority of page faults are handled through the other paths. I still think this patch series is useful for workloads that may take these paths more often, and just for cleaning up the fault path in general (it's now clearer why we have to retry in these cases). This patch (of 6): Drop the VMA lock instead of the mmap_lock if that's the one which is held. Link: https://lkml.kernel.org/r/20231006195318.4087158-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231006195318.4087158-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 062c25b184a5..951709089f38 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3090,7 +3090,7 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, /* * NOTE! This will make us return with VM_FAULT_RETRY, but with - * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT + * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT * is supposed to work. We have way too many special cases.. */ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) @@ -3100,13 +3100,14 @@ static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio, if (vmf->flags & FAULT_FLAG_KILLABLE) { if (__folio_lock_killable(folio)) { /* - * We didn't have the right flags to drop the mmap_lock, - * but all fault_handlers only check for fatal signals - * if we return VM_FAULT_RETRY, so we need to drop the - * mmap_lock here and return 0 if we don't have a fpin. + * We didn't have the right flags to drop the + * fault lock, but all fault_handlers only check + * for fatal signals if we return VM_FAULT_RETRY, + * so we need to drop the fault lock here and + * return 0 if we don't have a fpin. */ if (*fpin == NULL) - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); return 0; } } else -- cgit v1.2.3 From 164b06f238b986317131e6b61b2f22aabcbc2cc0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:14 +0100 Subject: mm: call wp_page_copy() under the VMA lock It is usually safe to call wp_page_copy() under the VMA lock. The only unsafe situation is when no anon_vma has been allocated for this VMA, and we have to look at adjacent VMAs to determine if their anon_vma can be shared. Since this happens only for the first COW of a page in this VMA, the majority of calls to wp_page_copy() do not need to fall back to the mmap_sem. Add vmf_anon_prepare() as an alternative to anon_vma_prepare() which will return RETRY if we currently hold the VMA lock and need to allocate an anon_vma. This lets us drop the check in do_wp_page(). Link: https://lkml.kernel.org/r/20231006195318.4087158-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d0f61d729fb4..c437d3562d18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (likely(vma->anon_vma)) + return 0; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (__anon_vma_prepare(vma)) + return VM_FAULT_OOM; + return 0; +} + /* * Handle the case of a page which we actually need to copy to a new page, * either due to COW or unsharing. @@ -3069,27 +3084,29 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; - int ret; + vm_fault_t ret; delayacct_wpcopy_start(); if (vmf->page) old_folio = page_folio(vmf->page); - if (unlikely(anon_vma_prepare(vma))) - goto oom; + ret = vmf_anon_prepare(vmf); + if (unlikely(ret)) + goto out; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_folio = vma_alloc_zeroed_movable_folio(vma, vmf->address); if (!new_folio) goto oom; } else { + int err; new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false); if (!new_folio) goto oom; - ret = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); - if (ret) { + err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf); + if (err) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on @@ -3102,7 +3119,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) folio_put(old_folio); delayacct_wpcopy_end(); - return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; + return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(&new_folio->page, vmf->page); } @@ -3212,11 +3229,13 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) oom_free_new: folio_put(new_folio); oom: + ret = VM_FAULT_OOM; +out: if (old_folio) folio_put(old_folio); delayacct_wpcopy_end(); - return VM_FAULT_OOM; + return ret; } /** @@ -3458,12 +3477,6 @@ reuse: return 0; } copy: - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From 4ed4379881aa62588aba6442a9f362a8cf7624e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:15 +0100 Subject: mm: handle shared faults under the VMA lock There are many implementations of ->fault and some of them depend on mmap_lock being held. All vm_ops that implement ->map_pages() end up calling filemap_fault(), which I have audited to be sure it does not rely on mmap_lock. So (for now) key off ->map_pages existing as a flag to indicate that it's safe to call ->fault while only holding the vma lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c437d3562d18..7b240a67f207 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf) count_vm_event(PGREUSE); } +/* + * We could add a bitflag somewhere, but for now, we know that all + * vm_ops that have a ->map_pages have been audited and don't need + * the mmap_lock to be held. + */ +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + + if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK)) + return 0; + vma_end_read(vma); + return VM_FAULT_RETRY; +} + static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) vm_fault_t ret, tmp; struct folio *folio; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit v1.2.3 From 4de8c93a4751e10737b6af65db42c743228c67a6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:16 +0100 Subject: mm: handle COW faults under the VMA lock If the page is not currently present in the page tables, we need to call the page fault handler to find out which page we're supposed to COW, so we need to both check that there is already an anon_vma and that the fault handler doesn't need the mmap_lock. Link: https://lkml.kernel.org/r/20231006195318.4087158-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7b240a67f207..be9469a29413 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4639,13 +4639,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; + ret = vmf_can_call_fault(vmf); + if (!ret) + ret = vmf_anon_prepare(vmf); + if (ret) + return ret; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) -- cgit v1.2.3 From 12214eba1992642eee5813a9cc9f626e5b2d1815 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:17 +0100 Subject: mm: handle read faults under the VMA lock Most file-backed faults are already handled through ->map_pages(), but if we need to do I/O we'll come this way. Since filemap_fault() is now safe to be called under the VMA lock, we can handle these faults under the VMA lock now. Link: https://lkml.kernel.org/r/20231006195318.4087158-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index be9469a29413..ada461b82a75 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4617,10 +4617,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) -- cgit v1.2.3 From 4a68fef16df9d88d528094116f8bbd2dbfa62089 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 6 Oct 2023 20:53:18 +0100 Subject: mm: handle write faults to RO pages under the VMA lock I think this is a pretty rare occurrence, but for consistency handle faults with the VMA lock held the same way that we handle other faults with the VMA lock held. Link: https://lkml.kernel.org/r/20231006195318.4087158-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index ada461b82a75..ceffe96f2a0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3301,10 +3301,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } + ret = vmf_can_call_fault(vmf); + if (ret) + return ret; vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); @@ -3328,10 +3327,10 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + tmp = vmf_can_call_fault(vmf); + if (tmp) { folio_put(folio); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; + return tmp; } tmp = do_page_mkwrite(vmf, folio); -- cgit v1.2.3 From 5ca432896a4ce6d69fffc3298b24c0dd9bdb871f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:47 +0200 Subject: mm/rmap: move SetPageAnonExclusive() out of page_move_anon_rmap() Patch series "mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap()". Convert page_move_anon_rmap() to folio_move_anon_rmap(), letting the callers handle PageAnonExclusive. I'm including cleanup patch #3 because it fits into the picture and can be done cleaner by the conversion. This patch (of 3): Let's move it into the caller: there is a difference between whether an anon folio can only be mapped by one process (e.g., into one VMA), and whether it is truly exclusive (e.g., no references -- including GUP -- from other processes). Further, for large folios the page might not actually be pointing at the head page of the folio, so it better be handled in the caller. This is a preparation for converting page_move_anon_rmap() to consume a folio. Link: https://lkml.kernel.org/r/20231002142949.235104-1-david@redhat.com Link: https://lkml.kernel.org/r/20231002142949.235104-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/huge_memory.c | 1 + mm/hugetlb.c | 4 +++- mm/memory.c | 1 + mm/rmap.c | 1 - 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index aa0224556132..76ead290f1c8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1377,6 +1377,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) pmd_t entry; page_move_anon_rmap(page, vma); + SetPageAnonExclusive(page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2878e0e6bac5..35d924f74972 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5652,8 +5652,10 @@ retry_avoidcopy: * owner and can reuse this page. */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { - if (!PageAnonExclusive(&old_folio->page)) + if (!PageAnonExclusive(&old_folio->page)) { page_move_anon_rmap(&old_folio->page, vma); + SetPageAnonExclusive(&old_folio->page); + } if (likely(!unshare)) set_huge_ptep_writable(vma, haddr, ptep); diff --git a/mm/memory.c b/mm/memory.c index ceffe96f2a0e..21fba1c9a6c7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3481,6 +3481,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * sunglasses. Hit it. */ page_move_anon_rmap(vmf->page, vma); + SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: if (unlikely(unshare)) { diff --git a/mm/rmap.c b/mm/rmap.c index c6bb2339e35b..6f1ea1491118 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1152,7 +1152,6 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) * folio_test_anon()) will not see one without the other. */ WRITE_ONCE(folio->mapping, anon_vma); - SetPageAnonExclusive(page); } /** -- cgit v1.2.3 From 069686255c16a75b6a796e42df47f5af27b496a4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:48 +0200 Subject: mm/rmap: convert page_move_anon_rmap() to folio_move_anon_rmap() Let's convert it to consume a folio. [akpm@linux-foundation.org: fix kerneldoc] Link: https://lkml.kernel.org/r/20231002142949.235104-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Suren Baghdasaryan Reviewed-by: Vishal Moola (Oracle) Cc: Mike Kravetz Cc: Muchun Song Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 +- mm/hugetlb.c | 2 +- mm/memory.c | 2 +- mm/rmap.c | 16 +++++++--------- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d22f4d21a11c..b26fe858fd44 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -189,7 +189,7 @@ typedef int __bitwise rmap_t; /* * rmap interfaces called when adding or removing pte of page */ -void page_move_anon_rmap(struct page *, struct vm_area_struct *); +void folio_move_anon_rmap(struct folio *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long address, rmap_t flags); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76ead290f1c8..51a66eb48938 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,7 +1376,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) if (folio_ref_count(folio) == 1) { pmd_t entry; - page_move_anon_rmap(page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(page); folio_unlock(folio); reuse: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 35d924f74972..7ad9d2159da4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5653,7 +5653,7 @@ retry_avoidcopy: */ if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) { if (!PageAnonExclusive(&old_folio->page)) { - page_move_anon_rmap(&old_folio->page, vma); + folio_move_anon_rmap(old_folio, vma); SetPageAnonExclusive(&old_folio->page); } if (likely(!unshare)) diff --git a/mm/memory.c b/mm/memory.c index 21fba1c9a6c7..6c67828f934c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3480,7 +3480,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(vmf->page, vma); + folio_move_anon_rmap(folio, vma); SetPageAnonExclusive(vmf->page); folio_unlock(folio); reuse: diff --git a/mm/rmap.c b/mm/rmap.c index 6f1ea1491118..7a27a2b41802 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1128,19 +1128,17 @@ int folio_total_mapcount(struct folio *folio) } /** - * page_move_anon_rmap - move a page to our anon_vma - * @page: the page to move to our anon_vma - * @vma: the vma the page belongs to + * folio_move_anon_rmap - move a folio to our anon_vma + * @folio: The folio to move to our anon_vma + * @vma: The vma the folio belongs to * - * When a page belongs exclusively to one process after a COW event, - * that page can be moved into the anon_vma that belongs to just that - * process, so the rmap code will not search the parent or sibling - * processes. + * When a folio belongs exclusively to one process after a COW event, + * that folio can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling processes. */ -void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) +void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma) { void *anon_vma = vma->anon_vma; - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); -- cgit v1.2.3 From dec078cc2181fccf8b134406b86aaacc19f7163f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 2 Oct 2023 16:29:49 +0200 Subject: memory: move exclusivity detection in do_wp_page() into wp_can_reuse_anon_folio() Let's clean up do_wp_page() a bit, removing two labels and making it a easier to read. wp_can_reuse_anon_folio() now only operates on the whole folio. Move the SetPageAnonExclusive() out into do_wp_page(). No need to do this under page lock -- the page table lock is sufficient. Link: https://lkml.kernel.org/r/20231002142949.235104-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Mike Kravetz Cc: Muchun Song Cc: Suren Baghdasaryan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 88 +++++++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6c67828f934c..22784d3b886d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3355,6 +3355,44 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) return ret; } +static bool wp_can_reuse_anon_folio(struct folio *folio, + struct vm_area_struct *vma) +{ + /* + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing + * the swapcache if there is little hope that we can reuse. + * + * KSM doesn't necessarily raise the folio refcount. + */ + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) + return false; + if (!folio_test_lru(folio)) + /* + * We cannot easily detect+handle references from + * remote LRU caches or references to LRU folios. + */ + lru_add_drain(); + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) + return false; + if (!folio_trylock(folio)) + return false; + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); + return false; + } + /* + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing + * sunglasses. Hit it. + */ + folio_move_anon_rmap(folio, vma); + folio_unlock(folio); + return true; +} + /* * This routine handles present pages, when * * users try to write to a shared page (FAULT_FLAG_WRITE) @@ -3441,49 +3479,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. + * + * If we encounter a page that is marked exclusive, we must reuse + * the page without further checks. */ - if (folio && folio_test_anon(folio)) { - /* - * If the page is exclusive to this process we must reuse the - * page without further checks. - */ - if (PageAnonExclusive(vmf->page)) - goto reuse; - - /* - * We have to verify under folio lock: these early checks are - * just an optimization to avoid locking the folio and freeing - * the swapcache if there is little hope that we can reuse. - * - * KSM doesn't necessarily raise the folio refcount. - */ - if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) - goto copy; - if (!folio_test_lru(folio)) - /* - * We cannot easily detect+handle references from - * remote LRU caches or references to LRU folios. - */ - lru_add_drain(); - if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) - goto copy; - if (!folio_trylock(folio)) - goto copy; - if (folio_test_swapcache(folio)) - folio_free_swap(folio); - if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { - folio_unlock(folio); - goto copy; - } - /* - * Ok, we've got the only folio reference from our mapping - * and the folio is locked, it's dark out, and we're wearing - * sunglasses. Hit it. - */ - folio_move_anon_rmap(folio, vma); - SetPageAnonExclusive(vmf->page); - folio_unlock(folio); -reuse: + if (folio && folio_test_anon(folio) && + (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) { + if (!PageAnonExclusive(vmf->page)) + SetPageAnonExclusive(vmf->page); if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; @@ -3491,7 +3494,6 @@ reuse: wp_page_reuse(vmf); return 0; } -copy: /* * Ok, we need to copy. Oh, well.. */ -- cgit v1.2.3 From ec47e250628903d48d9ff490c6b532df889bcaa3 Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Tue, 3 Oct 2023 10:48:57 -0400 Subject: mm/migrate: remove unused mm argument from do_move_pages_to_node This function does not actively use the mm_struct, it can be removed. Link: https://lkml.kernel.org/r/20231003144857.752952-2-gregory.price@memverge.com Signed-off-by: Gregory Price Reviewed-by: Jonathan Cameron Cc: Arnd Bergmann Cc: Gregory Price Signed-off-by: Andrew Morton --- mm/migrate.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 482f478ea3fe..9d9eee5322d1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2029,8 +2029,7 @@ static int store_status(int __user *status, int start, int value, int nr) return 0; } -static int do_move_pages_to_node(struct mm_struct *mm, - struct list_head *pagelist, int node) +static int do_move_pages_to_node(struct list_head *pagelist, int node) { int err; struct migration_target_control mtc = { @@ -2119,7 +2118,7 @@ out: return err; } -static int move_pages_and_store_status(struct mm_struct *mm, int node, +static int move_pages_and_store_status(int node, struct list_head *pagelist, int __user *status, int start, int i, unsigned long nr_pages) { @@ -2128,7 +2127,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node, if (list_empty(pagelist)) return 0; - err = do_move_pages_to_node(mm, pagelist, node); + err = do_move_pages_to_node(pagelist, node); if (err) { /* * Positive err means the number of failed @@ -2196,7 +2195,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = node; start = i; } else if (node != current_node) { - err = move_pages_and_store_status(mm, current_node, + err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) goto out; @@ -2231,7 +2230,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, if (err) goto out_flush; - err = move_pages_and_store_status(mm, current_node, &pagelist, + err = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err) { /* We have accounted for page i */ @@ -2243,7 +2242,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, } out_flush: /* Make sure we do not overwrite the existing error */ - err1 = move_pages_and_store_status(mm, current_node, &pagelist, + err1 = move_pages_and_store_status(current_node, &pagelist, status, start, i, nr_pages); if (err >= 0) err = err1; -- cgit v1.2.3 From 8c2214fc9a470aee0c615aeb14d8c7ce98e45a08 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Tue, 3 Oct 2023 20:41:55 +0900 Subject: mm: multi-gen LRU: reuse some legacy trace events As the legacy lru provides, the mglru needs some trace events for debugging. Let's reuse following legacy events for the mglru. trace_mm_vmscan_lru_isolate trace_mm_vmscan_lru_shrink_inactive Here's an example mm_vmscan_lru_isolate: classzone=2 order=0 nr_requested=4096 nr_scanned=64 nr_skipped=0 nr_taken=64 lru=inactive_file mm_vmscan_lru_shrink_inactive: nid=0 nr_scanned=64 nr_reclaimed=63 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=1 nr_ref_keep=0 nr_unmap_fail=0 priority=2 flags=RECLAIM_WB_FILE|RECLAIM_WB_ASYNC Link: https://lkml.kernel.org/r/20231003114155.21869-1-jaewon31.kim@samsung.com Signed-off-by: Jaewon Kim Acked-by: Yu Zhao Cc: Johannes Weiner Cc: Kalesh Singh Cc: SeongJae Park Cc: Steven Rostedt (Google) Cc: T.J. Mercier Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 55424215f759..506f8220c5fe 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4304,6 +4304,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int sorted = 0; int scanned = 0; int isolated = 0; + int skipped = 0; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); @@ -4317,7 +4318,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); - int skipped = 0; + int skipped_zone = 0; int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; @@ -4339,16 +4340,17 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, isolated += delta; } else { list_move(&folio->lru, &moved); - skipped += delta; + skipped_zone += delta; } - if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + if (!--remaining || max(isolated, skipped_zone) >= MIN_LRU_BATCH) break; } - if (skipped) { + if (skipped_zone) { list_splice(&moved, head); - __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped_zone); + skipped += skipped_zone; } if (!remaining || isolated >= MIN_LRU_BATCH) @@ -4363,6 +4365,9 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_memcg_events(memcg, item, isolated); __count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); + trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, + scanned, skipped, isolated, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); /* * There might not be eligible folios due to reclaim_idx. Check the @@ -4493,6 +4498,9 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); sc->nr_reclaimed += reclaimed; + trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, + scanned, reclaimed, &stat, sc->priority, + type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { if (!folio_evictable(folio)) { -- cgit v1.2.3 From c43cfa42541c04a3a94312e39ab81c41ba431277 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:51 +0100 Subject: mm: make __access_remote_vm() static Patch series "various improvements to the GUP interface", v2. A series of fixes to simplify and improve the GUP interface with an eye to providing groundwork to future improvements:- * __access_remote_vm() and access_remote_vm() are functionally identical, so make the former static such that in future we can potentially change the external-facing implementation details of this function. * Extend is_valid_gup_args() to cover the missing FOLL_TOUCH case, and simplify things by defining INTERNAL_GUP_FLAGS to check against. * Adjust __get_user_pages_locked() to explicitly treat a failure to pin any pages as an error in all circumstances other than FOLL_NOWAIT being specified, bringing it in line with the nommu implementation of this function. * (With many thanks to Arnd who suggested this in the first instance) Update get_user_page_vma_remote() to explicitly only return a page or an error, simplifying the interface and avoiding the questionable IS_ERR_OR_NULL() pattern. This patch (of 4): access_remote_vm() passes through parameters to __access_remote_vm() directly, so remove the __access_remote_vm() function from mm.h and use access_remote_vm() in the one caller that needs it (ptrace_access_vm()). This allows future adjustments to the GUP-internal __access_remote_vm() function while keeping the access_remote_vm() function stable. Link: https://lkml.kernel.org/r/cover.1696288092.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/f7877c5039ce1c202a514a8aeeefc5cdd5e32d19.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- kernel/ptrace.c | 2 +- mm/memory.c | 4 ++-- mm/nommu.c | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 52c40b3d0813..7b89f7bd420d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2415,8 +2415,6 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, - void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 443057bee87c..d8b5e13a2229 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -59,7 +59,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(mm, addr, buf, len, gup_flags); + ret = access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; diff --git a/mm/memory.c b/mm/memory.c index 22784d3b886d..e47c36c0aef0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5791,8 +5791,8 @@ EXPORT_SYMBOL_GPL(generic_access_phys); /* * Access another process' address space as given in mm. */ -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { void *old_buf = buf; int write = gup_flags & FOLL_WRITE; diff --git a/mm/nommu.c b/mm/nommu.c index 7f9e9e5a0e12..f9553579389b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1651,8 +1651,8 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, - int len, unsigned int gup_flags) +static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; -- cgit v1.2.3 From 0f20bba1688bdf3b32df0162511a67d4eda15790 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:52 +0100 Subject: mm/gup: explicitly define and check internal GUP flags, disallow FOLL_TOUCH Rather than open-coding a list of internal GUP flags in is_valid_gup_args(), define which ones are internal. In addition, explicitly check to see if the user passed in FOLL_TOUCH somehow, as this appears to have been accidentally excluded. Link: https://lkml.kernel.org/r/971e013dfe20915612ea8b704e801d7aef9a66b6.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 5 ++--- mm/internal.h | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2f8a2d89fde1..b21b33d1787e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2227,12 +2227,11 @@ static bool is_valid_gup_args(struct page **pages, int *locked, /* * These flags not allowed to be specified externally to the gup * interfaces: - * - FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only + * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only * - FOLL_REMOTE is internal only and used on follow_page() * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL */ - if (WARN_ON_ONCE(gup_flags & (FOLL_PIN | FOLL_TRIED | FOLL_UNLOCKABLE | - FOLL_REMOTE | FOLL_FAST_ONLY))) + if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS)) return false; gup_flags |= to_set; diff --git a/mm/internal.h b/mm/internal.h index 5a5c923725d3..2b79da9deb64 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1017,6 +1017,9 @@ enum { FOLL_UNLOCKABLE = 1 << 21, }; +#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \ + FOLL_FAST_ONLY | FOLL_UNLOCKABLE) + /* * Indicates for which pages that are write-protected in the page table, * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the -- cgit v1.2.3 From 9c4b21422507035f3e0a507a680c9b03c0bcc730 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:53 +0100 Subject: mm/gup: make failure to pin an error if FOLL_NOWAIT not specified There really should be no circumstances under which a non-FOLL_NOWAIT GUP operation fails to return any pages, so make this an error and warn on it. To catch the trivial case, simply exit early if nr_pages == 0. This brings __get_user_pages_locked() in line with the behaviour of its nommu variant. Link: https://lkml.kernel.org/r/2a42d96dd1e37163f90a0019a541163dafb7e4c3.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Arnd Bergmann Reviewed-by: David Hildenbrand Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Catalin Marinas Cc: Ian Rogers Cc: Ingo Molnar Cc: Jason Gunthorpe Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index b21b33d1787e..231711efa390 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1471,6 +1471,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, long ret, pages_done; bool must_unlock = false; + if (!nr_pages) + return 0; + /* * The internal caller expects GUP to manage the lock internally and the * lock must be released when this returns. @@ -1595,6 +1598,14 @@ retry: mmap_read_unlock(mm); *locked = 0; } + + /* + * Failing to pin anything implies something has gone wrong (except when + * FOLL_NOWAIT is specified). + */ + if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT))) + return -EFAULT; + return pages_done; } -- cgit v1.2.3 From 6a1960b8a8773324d870fa32ba68ff3106523a95 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 3 Oct 2023 00:14:54 +0100 Subject: mm/gup: adapt get_user_page_vma_remote() to never return NULL get_user_pages_remote() will never return 0 except in the case of FOLL_NOWAIT being specified, which we explicitly disallow. This simplifies error handling for the caller and avoids the awkwardness of dealing with both errors and failing to pin. Failing to pin here is an error. Link: https://lkml.kernel.org/r/00319ce292d27b3aae76a0eb220ce3f528187508.1696288092.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Acked-by: Catalin Marinas Reviewed-by: David Hildenbrand Reviewed-by: Jason Gunthorpe Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: John Hubbard Cc: Mark Rutland Cc: Namhyung Kim Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Cochran Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/mte.c | 4 ++-- include/linux/mm.h | 12 +++++++++--- kernel/events/uprobes.c | 4 ++-- mm/memory.c | 3 +-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4edecaac8f91..8878b392df58 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -411,8 +411,8 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { - err = page == NULL ? -EIO : PTR_ERR(page); + if (IS_ERR(page)) { + err = PTR_ERR(page); break; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b89f7bd420d..fa608cba041f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2425,6 +2425,9 @@ long pin_user_pages_remote(struct mm_struct *mm, unsigned int gup_flags, struct page **pages, int *locked); +/* + * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT. + */ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, unsigned long addr, int gup_flags, @@ -2432,12 +2435,15 @@ static inline struct page *get_user_page_vma_remote(struct mm_struct *mm, { struct page *page; struct vm_area_struct *vma; - int got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); + int got; + + if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT))) + return ERR_PTR(-EINVAL); + + got = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL); if (got < 0) return ERR_PTR(got); - if (got == 0) - return NULL; vma = vma_lookup(mm, addr); if (WARN_ON_ONCE(!vma)) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3048589e2e85..435aac1d8c27 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -474,8 +474,8 @@ retry: gup_flags |= FOLL_SPLIT_PMD; /* Read the page with vaddr into memory */ old_page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma); - if (IS_ERR_OR_NULL(old_page)) - return old_page ? PTR_ERR(old_page) : 0; + if (IS_ERR(old_page)) + return PTR_ERR(old_page); ret = verify_opcode(old_page, vaddr, &opcode); if (ret <= 0) diff --git a/mm/memory.c b/mm/memory.c index e47c36c0aef0..1f88e4f6fbf2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5815,7 +5815,7 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, struct page *page = get_user_page_vma_remote(mm, addr, gup_flags, &vma); - if (IS_ERR_OR_NULL(page)) { + if (IS_ERR(page)) { /* We might need to expand the stack to access it */ vma = vma_lookup(mm, addr); if (!vma) { @@ -5829,7 +5829,6 @@ static int __access_remote_vm(struct mm_struct *mm, unsigned long addr, continue; } - /* * Check if this is a VM_IO | VM_PFNMAP VMA, which * we can access using slightly different code. -- cgit v1.2.3 From 416a616e5481b3757a3a6f1ccdb15a2b75c26dca Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:42 +0200 Subject: arm64, kasan: update comment in kasan_init Patch series "kasan: assorted fixes and improvements". This patch (of 5): Update the comment in kasan_init to also mention the Hardware Tag-Based KASAN mode. Link: https://lkml.kernel.org/r/cover.1696605143.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/4186aefd368b019eaf27c907c4fa692a89448d66.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: kernel test robot Signed-off-by: Andrew Morton --- arch/arm64/mm/kasan_init.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index f17d066e85eb..555285ebd5af 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -300,7 +300,11 @@ void __init kasan_init(void) kasan_init_shadow(); kasan_init_depth(); #if defined(CONFIG_KASAN_GENERIC) - /* CONFIG_KASAN_SW_TAGS also requires kasan_init_sw_tags(). */ + /* + * Generic KASAN is now fully initialized. + * Software and Hardware Tag-Based modes still require + * kasan_init_sw_tags() and kasan_init_hw_tags() correspondingly. + */ pr_info("KernelAddressSanitizer initialized (generic)\n"); #endif } -- cgit v1.2.3 From d7196d87a1559db9ece24852e6167061b199d58d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:43 +0200 Subject: kasan: unify printk prefixes Unify prefixes for printk messages in mm/kasan/. Link: https://lkml.kernel.org/r/35589629806cf0840e5f01ec9d8011a7bad648df.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 2 +- mm/kasan/kasan_test_module.c | 2 +- mm/kasan/quarantine.c | 4 +++- mm/kasan/report_generic.c | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index b61cc6a42541..c707d6c6e019 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin */ -#define pr_fmt(fmt) "kasan_test: " fmt +#define pr_fmt(fmt) "kasan: test: " fmt #include #include diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index 7be7bed456ef..8b7b3ea2c74e 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -5,7 +5,7 @@ * Author: Andrey Ryabinin */ -#define pr_fmt(fmt) "kasan test: %s " fmt, __func__ +#define pr_fmt(fmt) "kasan: test: " fmt #include #include diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 152dca73f398..ca4529156735 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -8,6 +8,8 @@ * Based on code by Dmitry Chernenkov. */ +#define pr_fmt(fmt) "kasan: " fmt + #include #include #include @@ -414,7 +416,7 @@ static int __init kasan_cpu_quarantine_init(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online", kasan_cpu_online, kasan_cpu_offline); if (ret < 0) - pr_err("kasan cpu quarantine register failed [%d]\n", ret); + pr_err("cpu quarantine register failed [%d]\n", ret); return ret; } late_initcall(kasan_cpu_quarantine_init); diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 51a1e8a8877f..99cbcd73cff7 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -220,7 +220,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, const size_t tok_len = sep - *frame_descr; if (tok_len + 1 > max_tok_len) { - pr_err("KASAN internal error: frame description too long: %s\n", + pr_err("internal error: frame description too long: %s\n", *frame_descr); return false; } @@ -233,7 +233,7 @@ static bool __must_check tokenize_frame_descr(const char **frame_descr, *frame_descr = sep + 1; if (value != NULL && kstrtoul(token, 10, value)) { - pr_err("KASAN internal error: not a valid number: %s\n", token); + pr_err("internal error: not a valid number: %s\n", token); return false; } @@ -323,7 +323,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr, frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE); if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { - pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + pr_err("internal error: frame has invalid marker: %lu\n", frame[0]); return false; } -- cgit v1.2.3 From 01a5ad81637672940844052404678678a0ec8854 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:44 +0200 Subject: kasan: use unchecked __memset internally KASAN code is supposed to use the unchecked __memset implementation when accessing its metadata. Change uses of memset to __memset in mm/kasan/. Link: https://lkml.kernel.org/r/6f621966c6f52241b5aaa7220c348be90c075371.1696605143.git.andreyknvl@google.com Fixes: 59e6e098d1c1 ("kasan: introduce kasan_complete_mode_report_info") Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: kernel test robot Signed-off-by: Andrew Morton --- mm/kasan/report.c | 4 ++-- mm/kasan/shadow.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 6e3cb118d20e..e77facb62900 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -538,7 +538,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); - memset(&info, 0, sizeof(info)); + __memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; info.access_size = 0; @@ -576,7 +576,7 @@ bool kasan_report(const void *addr, size_t size, bool is_write, start_report(&irq_flags, true); - memset(&info, 0, sizeof(info)); + __memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = addr; info.access_size = size; diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index dd772f9d0f08..d687f09a7ae3 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -324,7 +324,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, if (!page) return -ENOMEM; - memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + __memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); spin_lock(&init_mm.page_table_lock); -- cgit v1.2.3 From ff093a9632d9de9ff66dfd9db211008138520e13 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:45 +0200 Subject: kasan: fix and update KUNIT_EXPECT_KASAN_FAIL comment Update the comment for KUNIT_EXPECT_KASAN_FAIL to describe the parameters this macro accepts. Also drop the mention of the "kasan_status" KUnit resource, as it no longer exists. Link: https://lkml.kernel.org/r/6fad6661e72c407450ae4b385c71bc4a7e1579cd.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308171757.7V5YUcje-lkp@intel.com/ Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index c707d6c6e019..2030c7ff7de9 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -91,10 +91,11 @@ static void kasan_test_exit(struct kunit *test) } /** - * KUNIT_EXPECT_KASAN_FAIL() - check that the executed expression produces a - * KASAN report; causes a test failure otherwise. This relies on a KUnit - * resource named "kasan_status". Do not use this name for KUnit resources - * outside of KASAN tests. + * KUNIT_EXPECT_KASAN_FAIL - check that the executed expression produces a + * KASAN report; causes a KUnit test failure otherwise. + * + * @test: Currently executing KUnit test. + * @expression: Expression that must produce a KASAN report. * * For hardware tag-based KASAN, when a synchronous tag fault happens, tag * checking is auto-disabled. When this happens, this test handler reenables -- cgit v1.2.3 From 651acf0ceb72903ff16fe31e52c1c448a0ca880c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Fri, 6 Oct 2023 17:18:46 +0200 Subject: Documentation: *san: drop "the" from article titles Drop "the" from the titles of documentation articles for KASAN, KCSAN, and KMSAN, as it is redundant. Also add SPDX-License-Identifier for kasan.rst. Link: https://lkml.kernel.org/r/1c4eb354a3a7b8ab56bf0c2fc6157c22050793ca.1696605143.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: kernel test robot Cc: Marco Elver Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 7 +++++-- Documentation/dev-tools/kcsan.rst | 4 ++-- Documentation/dev-tools/kmsan.rst | 6 +++--- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 382818a7197a..858c77fe7dc4 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -1,5 +1,8 @@ -The Kernel Address Sanitizer (KASAN) -==================================== +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2023, Google LLC. + +Kernel Address Sanitizer (KASAN) +================================ Overview -------- diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst index 3ae866dcc924..94b6802ab0ab 100644 --- a/Documentation/dev-tools/kcsan.rst +++ b/Documentation/dev-tools/kcsan.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 .. Copyright (C) 2019, Google LLC. -The Kernel Concurrency Sanitizer (KCSAN) -======================================== +Kernel Concurrency Sanitizer (KCSAN) +==================================== The Kernel Concurrency Sanitizer (KCSAN) is a dynamic race detector, which relies on compile-time instrumentation, and uses a watchpoint-based sampling diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst index 55fa82212eb2..323eedad53cd 100644 --- a/Documentation/dev-tools/kmsan.rst +++ b/Documentation/dev-tools/kmsan.rst @@ -1,9 +1,9 @@ .. SPDX-License-Identifier: GPL-2.0 .. Copyright (C) 2022, Google LLC. -=================================== -The Kernel Memory Sanitizer (KMSAN) -=================================== +=============================== +Kernel Memory Sanitizer (KMSAN) +=============================== KMSAN is a dynamic error detector aimed at finding uses of uninitialized values. It is based on compiler instrumentation, and is quite similar to the -- cgit v1.2.3 From 9a12d103f7d2d524f5e3d4358b9a9c03e4a9f1d2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 29 Sep 2023 14:30:41 -0400 Subject: mmap: add clarifying comment to vma_merge() code When tracing through the code in vma_merge(), it was not completely clear why the error return to a dup_anon_vma() call would not overwrite a previous attempt to the same function. This commit adds a comment specifying why it is safe. Link: https://lkml.kernel.org/r/20230929183041.2835469-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Jann Horn Link: https://lore.kernel.org/linux-mm/CAG48ez3iDwFPR=Ed1BfrNuyUJPMK_=StjxhUsCkL6po1s7bONg@mail.gmail.com/ Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mmap.c b/mm/mmap.c index 011ba48d9e72..673429ee8a9e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -943,6 +943,11 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_start_write(curr); remove = curr; remove2 = next; + /* + * Note that the dup_anon_vma below cannot overwrite err + * since the first caller would do nothing unless next + * has an anon_vma. + */ if (!next->anon_vma) err = dup_anon_vma(prev, curr, &anon_dup); } -- cgit v1.2.3 From 27e0db3c21aaf1422980e64b77956e15b839306f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 27 Sep 2023 18:35:13 +0800 Subject: mm/page_alloc: remove unnecessary check in break_down_buddy_pages Patch series "Two minor cleanups to break_down_buddy_pages", v2. Two minor cleanups to break_down_buddy_pages. This patch (of 2): 1. We always have target in range started with next_page and full free range started with current_buddy. 2. The last split range size is 1 << low and low should be >= 0, then size >= 1. So page + size != page is always true (because size > 0). As summary, current_page will not equal to target page. Link: https://lkml.kernel.org/r/20230927103514.98281-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230927103514.98281-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 85741403948f..fdb68b7c8240 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6480,10 +6480,8 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, if (set_page_guard(zone, current_buddy, high, migratetype)) continue; - if (current_buddy != target) { - add_to_free_list(current_buddy, zone, high, migratetype); - set_buddy_order(current_buddy, high); - } + add_to_free_list(current_buddy, zone, high, migratetype); + set_buddy_order(current_buddy, high); } } -- cgit v1.2.3 From 0dfca313a009c83e2ad44b3719dc1222df6c6db5 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 27 Sep 2023 18:35:14 +0800 Subject: mm/page_alloc: remove unnecessary next_page in break_down_buddy_pages The next_page is only used to forward page in case target is in second half range. Move forward page directly to remove unnecessary next_page. Link: https://lkml.kernel.org/r/20230927103514.98281-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Naoya Horiguchi Cc: Matthew Wilcox (Oracle) Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fdb68b7c8240..f46e519618a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6462,20 +6462,18 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page, int migratetype) { unsigned long size = 1 << high; - struct page *current_buddy, *next_page; + struct page *current_buddy; while (high > low) { high--; size >>= 1; if (target >= &page[size]) { - next_page = page + size; current_buddy = page; + page = page + size; } else { - next_page = page; current_buddy = page + size; } - page = next_page; if (set_page_guard(zone, current_buddy, high, migratetype)) continue; -- cgit v1.2.3 From bafd7e9d353ea091958d0dc702390add34b480e9 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Fri, 6 Oct 2023 13:01:20 +0200 Subject: filemap: call filemap_get_folios_tag() from filemap_get_folios() filemap_get_folios() is filemap_get_folios_tag() with XA_PRESENT as the tag that is being matched. Return filemap_get_folios_tag() with XA_PRESENT as the tag instead of duplicating the code in filemap_get_folios(). No functional changes. Link: https://lkml.kernel.org/r/20231006110120.136809-1-kernel@pankajraghav.com Signed-off-by: Pankaj Raghav Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 45 ++++++++------------------------------------- 1 file changed, 8 insertions(+), 37 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 951709089f38..cd872fbc6086 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2120,48 +2120,13 @@ put: * index @start and up to index @end (inclusive). The folios are returned * in @fbatch with an elevated reference count. * - * The first folio may start before @start; if it does, it will contain - * @start. The final folio may extend beyond @end; if it does, it will - * contain @end. The folios have ascending indices. There may be gaps - * between the folios if there are indices which have no folio in the - * page cache. If folios are added to or removed from the page cache - * while this is running, they may or may not be found by this call. - * * Return: The number of folios which were found. * We also update @start to index the next folio for the traversal. */ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch) { - XA_STATE(xas, &mapping->i_pages, *start); - struct folio *folio; - - rcu_read_lock(); - while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) { - /* Skip over shadow, swap and DAX entries */ - if (xa_is_value(folio)) - continue; - if (!folio_batch_add(fbatch, folio)) { - unsigned long nr = folio_nr_pages(folio); - *start = folio->index + nr; - goto out; - } - } - - /* - * We come here when there is no page beyond @end. We take care to not - * overflow the index @start as it confuses some of the callers. This - * breaks the iteration when there is a page at index -1 but that is - * already broken anyway. - */ - if (end == (pgoff_t)-1) - *start = (pgoff_t)-1; - else - *start = end + 1; -out: - rcu_read_unlock(); - - return folio_batch_count(fbatch); + return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch); } EXPORT_SYMBOL(filemap_get_folios); @@ -2240,7 +2205,13 @@ EXPORT_SYMBOL(filemap_get_folios_contig); * @tag: The tag index * @fbatch: The batch to fill * - * Same as filemap_get_folios(), but only returning folios tagged with @tag. + * The first folio may start before @start; if it does, it will contain + * @start. The final folio may extend beyond @end; if it does, it will + * contain @end. The folios have ascending indices. There may be gaps + * between the folios if there are indices which have no folio in the + * page cache. If folios are added to or removed from the page cache + * while this is running, they may or may not be found by this call. + * Only returns folios that are tagged with @tag. * * Return: The number of folios found. * Also update @start to index the next folio for traversal. -- cgit v1.2.3 From afb2d666d0250e707eef3c5947165b414268244b Mon Sep 17 00:00:00 2001 From: Mark-PK Tsai Date: Fri, 6 Oct 2023 14:02:40 +0800 Subject: zsmalloc: use copy_page for full page copy Some architectures have implemented optimized copy_page for full page copying, such as arm. On my arm platform, use the copy_page helper for single page copying is about 10 percent faster than memcpy. Link: https://lkml.kernel.org/r/20231006060245.7411-1-mark-pk.tsai@mediatek.com Signed-off-by: Mark-PK Tsai Reviewed-by: Sergey Senozhatsky Cc: AngeloGioacchino Del Regno Cc: Matthias Brugger Cc: Minchan Kim Cc: YJ Chiang Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c743ce7a5f49..b1c0dad7f4cf 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1839,7 +1839,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * Here, any user cannot access all objects in the zspage so let's move. */ d_addr = kmap_atomic(newpage); - memcpy(d_addr, s_addr, PAGE_SIZE); + copy_page(d_addr, s_addr); kunmap_atomic(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; -- cgit v1.2.3 From c8b90731427814b1e265c3c2bc01c38406963c32 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Oct 2023 09:39:21 -0700 Subject: selftests/mm: export get_free_hugepages() Patch series "New selftest for mm", v2. This is a simple test case that reproduces an mm problem[1], where a page fault races with madvise(), and it is not trivial to reproduce and debug. This test-case aims to avoid such race problems from happening again, impacting workloads that leverages external allocators, such as tcmalloc, jemalloc, etc. [1] https://lore.kernel.org/all/20231001005659.2185316-1-riel@surriel.com/#r This patch (of 2): get_free_hugepages() is helpful for other hugepage tests. Export it to the common file (vm_util.c) to be reused. Link: https://lkml.kernel.org/r/20231005163922.87568-1-leitao@debian.org Link: https://lkml.kernel.org/r/20231005163922.87568-2-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Rik van Riel Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb-madvise.c | 19 ------------------- tools/testing/selftests/mm/vm_util.c | 19 +++++++++++++++++++ tools/testing/selftests/mm/vm_util.h | 1 + 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb-madvise.c b/tools/testing/selftests/mm/hugetlb-madvise.c index d55322df4b73..f32d99565c5e 100644 --- a/tools/testing/selftests/mm/hugetlb-madvise.c +++ b/tools/testing/selftests/mm/hugetlb-madvise.c @@ -36,25 +36,6 @@ unsigned long huge_page_size; unsigned long base_page_size; -unsigned long get_free_hugepages(void) -{ - unsigned long fhp = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return fhp; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) - break; - } - - free(line); - fclose(f); - return fhp; -} - void write_fault_pages(void *addr, unsigned long nr_pages) { unsigned long i; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index 558c9cd8901c..3082b40492dd 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -269,3 +269,22 @@ int uffd_unregister(int uffd, void *addr, uint64_t len) return ret; } + +unsigned long get_free_hugepages(void) +{ + unsigned long fhp = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return fhp; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) + break; + } + + free(line); + fclose(f); + return fhp; +} diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index c7fa61f0dff8..c02990bbd56f 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -51,6 +51,7 @@ int uffd_register(int uffd, void *addr, uint64_t len, int uffd_unregister(int uffd, void *addr, uint64_t len); int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, bool miss, bool wp, bool minor, uint64_t *ioctls); +unsigned long get_free_hugepages(void); /* * On ppc64 this will only work with radix 2M hugepage size -- cgit v1.2.3 From 116d57303a051bb2c7939a5026e441d8a7845db2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 5 Oct 2023 09:39:22 -0700 Subject: selftests/mm: add a new test for madv and hugetlb Create a selftest that exercises the race between page faults and madvise(MADV_DONTNEED) in the same huge page. Do it by running two threads that touches the huge page and madvise(MADV_DONTNEED) at the same time. In case of a SIGBUS coming at pagefault, the test should fail, since we hit the bug. The test doesn't have a signal handler, and if it fails, it fails like the following ---------------------------------- running ./hugetlb_fault_after_madv ---------------------------------- ./run_vmtests.sh: line 186: 595563 Bus error (core dumped) "$@" [FAIL] This selftest goes together with the fix of the bug[1] itself. [1] https://lore.kernel.org/all/20231001005659.2185316-1-riel@surriel.com/#r Link: https://lkml.kernel.org/r/20231005163922.87568-3-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Rik van Riel Tested-by: Rik van Riel Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/hugetlb_fault_after_madv.c | 73 ++++++++++++++++++++++ tools/testing/selftests/mm/run_vmtests.sh | 4 ++ 3 files changed, 78 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb_fault_after_madv.c diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 2a89989afafc..78dfec8bc676 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -69,6 +69,7 @@ TEST_GEN_FILES += split_huge_page_test TEST_GEN_FILES += ksm_tests TEST_GEN_FILES += ksm_functional_tests TEST_GEN_FILES += mdwe_test +TEST_GEN_FILES += hugetlb_fault_after_madv ifneq ($(ARCH),arm64) TEST_GEN_PROGS += soft-dirty diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c new file mode 100644 index 000000000000..73b81c632366 --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include + +#include "vm_util.h" +#include "../kselftest.h" + +#define MMAP_SIZE (1 << 21) +#define INLOOP_ITER 100 + +char *huge_ptr; + +/* Touch the memory while it is being madvised() */ +void *touch(void *unused) +{ + char *ptr = (char *)huge_ptr; + + for (int i = 0; i < INLOOP_ITER; i++) + ptr[0] = '.'; + + return NULL; +} + +void *madv(void *unused) +{ + usleep(rand() % 10); + + for (int i = 0; i < INLOOP_ITER; i++) + madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + + return NULL; +} + +int main(void) +{ + unsigned long free_hugepages; + pthread_t thread1, thread2; + /* + * On kernel 6.4, we are able to reproduce the problem with ~1000 + * interactions + */ + int max = 10000; + + srand(getpid()); + + free_hugepages = get_free_hugepages(); + if (free_hugepages != 1) { + ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", + free_hugepages); + } + + while (max--) { + huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + + if ((unsigned long)huge_ptr == -1) + ksft_exit_skip("Failed to allocated huge page\n"); + + pthread_create(&thread1, NULL, madv, NULL); + pthread_create(&thread2, NULL, touch, NULL); + + pthread_join(thread1, NULL); + pthread_join(thread2, NULL); + munmap(huge_ptr, MMAP_SIZE); + } + + return KSFT_PASS; +} diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 45941ad5f658..bf4c4cd46600 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -223,6 +223,10 @@ CATEGORY="hugetlb" run_test ./hugepage-mremap CATEGORY="hugetlb" run_test ./hugepage-vmemmap CATEGORY="hugetlb" run_test ./hugetlb-madvise +# For this test, we need one and just one huge page +echo 1 > /proc/sys/vm/nr_hugepages +CATEGORY="hugetlb" run_test ./hugetlb_fault_after_madv + if test_selected "hugetlb"; then echo "NOTE: These hugetlb tests provide minimal coverage. Use" echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" -- cgit v1.2.3 From 279d5fc3227f04ef2c6125e5c440e7952173a89a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:01 +0100 Subject: iomap: hold state_lock over call to ifs_set_range_uptodate() Patch series "Add folio_end_read", v2. The core of this patchset is the new folio_end_read() call which filesystems can use when finishing a page cache read instead of separate calls to mark the folio uptodate and unlock it. As an illustration of its use, I converted ext4, iomap & mpage; more can be converted. I think that's useful by itself, but the interesting optimisation is that we can implement that with a single XOR instruction that sets the uptodate bit, clears the lock bit, tests the waiter bit and provides a write memory barrier. That removes one memory barrier and one atomic instruction from each page read, which seems worth doing. That's in patch 15. The last two patches could be a separate series, but basically we can do the same thing with the writeback flag that we do with the unlock flag; clear it and test the waiters bit at the same time. This patch (of 17): This is really preparation for the next patch, but it lets us call folio_mark_uptodate() in just one place instead of two. Link: https://lkml.kernel.org/r/20231004165317.1061855-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231004165317.1061855-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Christophe Leroy Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 5db54ca29a35..6e780ca64ce3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -57,30 +57,32 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, return test_bit(block, ifs->state); } -static void ifs_set_range_uptodate(struct folio *folio, +static bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int first_blk = off >> inode->i_blkbits; unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; - unsigned long flags; - spin_lock_irqsave(&ifs->state_lock, flags); bitmap_set(ifs->state, first_blk, nr_blks); - if (ifs_is_fully_uptodate(folio, ifs)) - folio_mark_uptodate(folio); - spin_unlock_irqrestore(&ifs->state_lock, flags); + return ifs_is_fully_uptodate(folio, ifs); } static void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; + unsigned long flags; + bool uptodate = true; - if (ifs) - ifs_set_range_uptodate(folio, ifs, off, len); - else + if (ifs) { + spin_lock_irqsave(&ifs->state_lock, flags); + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + spin_unlock_irqrestore(&ifs->state_lock, flags); + } + + if (uptodate) folio_mark_uptodate(folio); } -- cgit v1.2.3 From f45b494e2a24d86afd79cab7c343b414c5213447 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:02 +0100 Subject: iomap: protect read_bytes_pending with the state_lock Perform one atomic operation (acquiring the spinlock) instead of two (spinlock & atomic_sub) per read completion. Link: https://lkml.kernel.org/r/20231004165317.1061855-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6e780ca64ce3..4a996c5327ef 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -29,9 +29,9 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); * and I/O completions. */ struct iomap_folio_state { - atomic_t read_bytes_pending; - atomic_t write_bytes_pending; spinlock_t state_lock; + unsigned int read_bytes_pending; + atomic_t write_bytes_pending; /* * Each block has two bits in this bitmap: @@ -183,7 +183,7 @@ static void ifs_free(struct folio *folio) if (!ifs) return; - WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending)); + WARN_ON_ONCE(ifs->read_bytes_pending != 0); WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); @@ -250,19 +250,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } -static void iomap_finish_folio_read(struct folio *folio, size_t offset, +static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { struct iomap_folio_state *ifs = folio->private; + bool uptodate = !error; + bool finished = true; - if (unlikely(error)) { - folio_clear_uptodate(folio); - folio_set_error(folio); - } else { - iomap_set_range_uptodate(folio, offset, len); + if (ifs) { + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + if (!error) + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + ifs->read_bytes_pending -= len; + finished = !ifs->read_bytes_pending; + spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending)) + if (error) + folio_set_error(folio); + if (uptodate) + folio_mark_uptodate(folio); + if (finished) folio_unlock(folio); } @@ -360,8 +370,11 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, } ctx->cur_folio_in_bio = true; - if (ifs) - atomic_add(plen, &ifs->read_bytes_pending); + if (ifs) { + spin_lock_irq(&ifs->state_lock); + ifs->read_bytes_pending += plen; + spin_unlock_irq(&ifs->state_lock); + } sector = iomap_sector(iomap, pos); if (!ctx->bio || -- cgit v1.2.3 From 0b237047d5a72ffe06c0bdf2f4536f669dcd31c9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:03 +0100 Subject: mm: add folio_end_read() Provide a function for filesystems to call when they have finished reading an entire folio. Link: https://lkml.kernel.org/r/20231004165317.1061855-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + mm/filemap.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 759b29d9a69a..bcc1ea44b4e8 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1101,6 +1101,7 @@ static inline void wait_on_page_locked(struct page *page) folio_wait_locked(page_folio(page)); } +void folio_end_read(struct folio *folio, bool success); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index cd872fbc6086..1ce78c619294 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1525,6 +1525,28 @@ void folio_unlock(struct folio *folio) } EXPORT_SYMBOL(folio_unlock); +/** + * folio_end_read - End read on a folio. + * @folio: The folio. + * @success: True if all reads completed successfully. + * + * When all reads against a folio have completed, filesystems should + * call this function to let the pagecache know that no more reads + * are outstanding. This will unlock the folio and wake up any thread + * sleeping on the lock. The folio will also be marked uptodate if all + * reads succeeded. + * + * Context: May be called from interrupt or process context. May not be + * called from NMI context. + */ +void folio_end_read(struct folio *folio, bool success) +{ + if (likely(success)) + folio_mark_uptodate(folio); + folio_unlock(folio); +} +EXPORT_SYMBOL(folio_end_read); + /** * folio_end_private_2 - Clear PG_private_2 and wake any waiters. * @folio: The folio. -- cgit v1.2.3 From f8174a1181220d24d6b4332216112318f5905729 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:04 +0100 Subject: ext4: use folio_end_read() folio_end_read() is the perfect fit for ext4. Link: https://lkml.kernel.org/r/20231004165317.1061855-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/ext4/readpage.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3e7d160f543f..21e8f0aebb3c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - struct folio *folio = fi.folio; - - if (bio->bi_status) - folio_clear_uptodate(folio); - else - folio_mark_uptodate(folio); - folio_unlock(folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); @@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, true); continue; } } else if (fully_mapped) { -- cgit v1.2.3 From 6ba924d341c24027d95352ae8802c9cd1c308559 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:05 +0100 Subject: buffer: use folio_end_read() There are two places that we can use this new helper. Link: https://lkml.kernel.org/r/20231004165317.1061855-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index edec8652788c..86ddfa52c699 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If all of the buffers are uptodate then we can set the page - * uptodate. - */ - if (folio_uptodate) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, folio_uptodate); return; still_busy: @@ -2433,12 +2427,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (!nr) { /* - * All buffers are uptodate - we can set the folio uptodate - * as well. But not if get_block() returned an error. + * All buffers are uptodate or get_block() returned an + * error when trying to map them - we can finish the read. */ - if (!page_error) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, !page_error); return 0; } -- cgit v1.2.3 From 7a4847e54cc1889d109ce2a6ebed19aafc4a4af8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:06 +0100 Subject: iomap: use folio_end_read() Combine the setting of the uptodate flag with the clearing of the locked flag. Link: https://lkml.kernel.org/r/20231004165317.1061855-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4a996c5327ef..5d19a2b47b6a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -270,10 +270,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (error) folio_set_error(folio); - if (uptodate) - folio_mark_uptodate(folio); if (finished) - folio_unlock(folio); + folio_end_read(folio, uptodate); } static void iomap_read_end_io(struct bio *bio) -- cgit v1.2.3 From 247dbcdbf790c52fc76cf8e327cd0a5778e41e66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:07 +0100 Subject: bitops: add xor_unlock_is_negative_byte() Replace clear_bit_and_unlock_is_negative_byte() with xor_unlock_is_negative_byte(). We have a few places that like to lock a folio, set a flag and unlock it again. Allow for the possibility of combining the latter two operations for efficiency. We are guaranteed that the caller holds the lock, so it is safe to unlock it with the xor. The caller must guarantee that nobody else will set the flag without holding the lock; it is not safe to do this with the PG_dirty flag, for example. Link: https://lkml.kernel.org/r/20231004165317.1061855-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/bitops.h | 17 +++++----------- arch/x86/include/asm/bitops.h | 11 +++++------ include/asm-generic/bitops/instrumented-lock.h | 27 ++++++++++++++------------ include/asm-generic/bitops/lock.h | 21 +++++--------------- kernel/kcsan/kcsan_test.c | 8 ++++---- kernel/kcsan/selftest.c | 8 ++++---- mm/filemap.c | 5 +++++ mm/kasan/kasan_test.c | 7 ++++--- 8 files changed, 47 insertions(+), 57 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 7e0f0322912b..40cc3ded60cb 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -234,32 +234,25 @@ static inline int arch_test_and_change_bit(unsigned long nr, } #ifdef CONFIG_PPC64 -static inline unsigned long -clear_bit_unlock_return_word(int nr, volatile unsigned long *addr) +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) { unsigned long old, t; - unsigned long *p = (unsigned long *)addr + BIT_WORD(nr); - unsigned long mask = BIT_MASK(nr); __asm__ __volatile__ ( PPC_RELEASE_BARRIER "1:" PPC_LLARX "%0,0,%3,0\n" - "andc %1,%0,%2\n" + "xor %1,%0,%2\n" PPC_STLCX "%1,0,%3\n" "bne- 1b\n" : "=&r" (old), "=&r" (t) : "r" (mask), "r" (p) : "cc", "memory"); - return old; + return (old & BIT_MASK(7)) != 0; } -/* - * This is a special function for mm/filemap.c - * Bit 7 corresponds to PG_waiters. - */ -#define arch_clear_bit_unlock_is_negative_byte(nr, addr) \ - (clear_bit_unlock_return_word(nr, addr) & BIT_MASK(7)) +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif /* CONFIG_PPC64 */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 2edf68475fec..f03c0a50ec3a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -94,18 +94,17 @@ arch___clear_bit(unsigned long nr, volatile unsigned long *addr) asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } -static __always_inline bool -arch_clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1" + asm volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) - : "ir" ((char) ~(1 << nr)) : "memory"); + : "iq" ((char)mask) : "memory"); return negative; } -#define arch_clear_bit_unlock_is_negative_byte \ - arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h index eb64bd4f11f3..e8ea3aeda9a9 100644 --- a/include/asm-generic/bitops/instrumented-lock.h +++ b/include/asm-generic/bitops/instrumented-lock.h @@ -58,27 +58,30 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) return arch_test_and_set_bit_lock(nr, addr); } -#if defined(arch_clear_bit_unlock_is_negative_byte) +#if defined(arch_xor_unlock_is_negative_byte) /** - * clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom - * byte is negative, for unlock. - * @nr: the bit to clear - * @addr: the address to start counting from + * xor_unlock_is_negative_byte - XOR a single byte in memory and test if + * it is negative, for unlock. + * @mask: Change the bits which are set in this mask. + * @addr: The address of the word containing the byte to change. * + * Changes some of bits 0-6 in the word pointed to by @addr. * This operation is atomic and provides release barrier semantics. + * Used to optimise some folio operations which are commonly paired + * with an unlock or end of writeback. Bit 7 is used as PG_waiters to + * indicate whether anybody is waiting for the unlock. * - * This is a bit of a one-trick-pony for the filemap code, which clears - * PG_locked and tests PG_waiters, + * Return: Whether the top bit of the byte is set. */ -static inline bool -clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) { kcsan_release(); - instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long)); - return arch_clear_bit_unlock_is_negative_byte(nr, addr); + instrument_atomic_write(addr, sizeof(long)); + return arch_xor_unlock_is_negative_byte(mask, addr); } /* Let everybody know we have it. */ -#define clear_bit_unlock_is_negative_byte clear_bit_unlock_is_negative_byte +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #endif #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */ diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 40913516e654..6a638e89d130 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -66,27 +66,16 @@ arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p) raw_atomic_long_set_release((atomic_long_t *)p, old); } -/** - * arch_clear_bit_unlock_is_negative_byte - Clear a bit in memory and test if bottom - * byte is negative, for unlock. - * @nr: the bit to clear - * @addr: the address to start counting from - * - * This is a bit of a one-trick-pony for the filemap code, which clears - * PG_locked and tests PG_waiters, - */ -#ifndef arch_clear_bit_unlock_is_negative_byte -static inline bool arch_clear_bit_unlock_is_negative_byte(unsigned int nr, - volatile unsigned long *p) +#ifndef arch_xor_unlock_is_negative_byte +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) { long old; - unsigned long mask = BIT_MASK(nr); - p += BIT_WORD(nr); - old = raw_atomic_long_fetch_andnot_release(mask, (atomic_long_t *)p); + old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p); return !!(old & BIT(7)); } -#define arch_clear_bit_unlock_is_negative_byte arch_clear_bit_unlock_is_negative_byte +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif #include diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 0ddbdab5903d..1333d23ac4ef 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -700,10 +700,10 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); - KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true); +#ifdef xor_unlock_is_negative_byte + KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); + KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); #endif kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 8679322450f2..619be7417420 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -228,10 +228,10 @@ static bool __init test_barrier(void) spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); -#ifdef clear_bit_unlock_is_negative_byte - KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); - KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var)); +#ifdef xor_unlock_is_negative_byte + KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); + KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); #endif kcsan_nestable_atomic_end(); diff --git a/mm/filemap.c b/mm/filemap.c index 1ce78c619294..c637863f4643 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,6 +1482,11 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); +#ifdef xor_unlock_is_negative_byte +#define clear_bit_unlock_is_negative_byte(nr, p) \ + xor_unlock_is_negative_byte(1 << nr, p) +#endif + #ifndef clear_bit_unlock_is_negative_byte /* diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 2030c7ff7de9..821c9ea0473a 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1099,9 +1099,10 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); -#if defined(clear_bit_unlock_is_negative_byte) - KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = - clear_bit_unlock_is_negative_byte(nr, addr)); +#if defined(xor_unlock_is_negative_byte) + if (nr < 7) + KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = + xor_unlock_is_negative_byte(1 << nr, addr)); #endif } -- cgit v1.2.3 From e28ff5dc8cf6aec042741f1ea62089dca6a894ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:08 +0100 Subject: alpha: implement xor_unlock_is_negative_byte Inspired by the alpha clear_bit() and arch_atomic_add_return(), this will surely be more efficient than the generic one defined in filemap.c. Link: https://lkml.kernel.org/r/20231004165317.1061855-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/alpha/include/asm/bitops.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index bafb1c1f0fdc..b50ad6b83e85 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -286,6 +286,27 @@ arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) #define arch_test_bit generic_test_bit #define arch_test_bit_acquire generic_test_bit_acquire +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) +{ + unsigned long temp, old; + + __asm__ __volatile__( + "1: ldl_l %0,%4\n" + " mov %0,%2\n" + " xor %0,%3,%0\n" + " stl_c %0,%1\n" + " beq %0,2f\n" + ".subsection 2\n" + "2: br 1b\n" + ".previous" + :"=&r" (temp), "=m" (*p), "=&r" (old) + :"Ir" (mask), "m" (*p)); + + return (old & BIT(7)) != 0; +} +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. -- cgit v1.2.3 From ea845e3173f7552aae539aeb943cd19ebe90ba38 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:09 +0100 Subject: m68k: implement xor_unlock_is_negative_byte Using EOR to clear the guaranteed-to-be-set lock bit will test the negative flag just like the x86 implementation. This should be more efficient than the generic implementation in filemap.c. It would be better if m68k had __GCC_ASM_FLAG_OUTPUTS__. Coldfire doesn't have a byte-sized EOR, so we test bit 7 after the EOR, which is a second memory access, but it's slightly better than the current C code. Link: https://lkml.kernel.org/r/20231004165317.1061855-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/m68k/include/asm/bitops.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index e984af71df6b..80ee36095905 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -319,6 +319,28 @@ arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) return test_and_change_bit(nr, addr); } +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) +{ +#ifdef CONFIG_COLDFIRE + __asm__ __volatile__ ("eorl %1, %0" + : "+m" (*p) + : "d" (mask) + : "memory"); + return *p & (1 << 7); +#else + char result; + char *cp = (char *)p + 3; /* m68k is big-endian */ + + __asm__ __volatile__ ("eor.b %1, %2; smi %0" + : "=d" (result) + : "di" (mask), "o" (*cp) + : "memory"); + return result; +#endif +} +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte + /* * The true 68020 and more advanced processors support the "bfffo" * instruction for finding bits. ColdFire and simple 68000 parts -- cgit v1.2.3 From 8da36b26e3d8640364a9e60e0b5c3fa3f55d298b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:10 +0100 Subject: mips: implement xor_unlock_is_negative_byte Inspired by the mips test_and_change_bit(), this will surely be more efficient than the generic one defined in filemap.c Link: https://lkml.kernel.org/r/20231004165317.1061855-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/mips/include/asm/bitops.h | 26 +++++++++++++++++++++++++- arch/mips/lib/bitops.c | 14 ++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index b4bf754f7db3..d98a05c478f4 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -73,7 +73,8 @@ int __mips_test_and_clear_bit(unsigned long nr, volatile unsigned long *addr); int __mips_test_and_change_bit(unsigned long nr, volatile unsigned long *addr); - +bool __mips_xor_is_negative_byte(unsigned long mask, + volatile unsigned long *addr); /* * set_bit - Atomically set a bit in memory @@ -279,6 +280,29 @@ static inline int test_and_change_bit(unsigned long nr, return res; } +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *p) +{ + unsigned long orig; + bool res; + + smp_mb__before_atomic(); + + if (!kernel_uses_llsc) { + res = __mips_xor_is_negative_byte(mask, p); + } else { + orig = __test_bit_op(*p, "%0", + "xor\t%1, %0, %3", + "ir"(mask)); + res = (orig & BIT(7)) != 0; + } + + smp_llsc_mb(); + + return res; +} +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte + #undef __bit_op #undef __test_bit_op diff --git a/arch/mips/lib/bitops.c b/arch/mips/lib/bitops.c index 116d0bd8b2ae..00aee98e9d54 100644 --- a/arch/mips/lib/bitops.c +++ b/arch/mips/lib/bitops.c @@ -146,3 +146,17 @@ int __mips_test_and_change_bit(unsigned long nr, volatile unsigned long *addr) return res; } EXPORT_SYMBOL(__mips_test_and_change_bit); + +bool __mips_xor_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) +{ + unsigned long flags; + unsigned long data; + + raw_local_irq_save(flags); + data = *addr; + *addr = data ^ mask; + raw_local_irq_restore(flags); + + return (data & BIT(7)) != 0; +} -- cgit v1.2.3 From 51a752c28bcf901618bbc25a43f84ef539f9e682 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:11 +0100 Subject: powerpc: implement arch_xor_unlock_is_negative_byte on 32-bit Simply remove the ifdef. The assembly is identical to that in the non-optimised case of test_and_clear_bits() on PPC32, and it's not clear to me how the PPC32 optimisation works, nor whether it would work for arch_xor_unlock_is_negative_byte(). If that optimisation would work, someone can implement it later, but this is more efficient than the implementation in filemap.c. Link: https://lkml.kernel.org/r/20231004165317.1061855-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/bitops.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 40cc3ded60cb..671ecc6711e3 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -233,7 +233,6 @@ static inline int arch_test_and_change_bit(unsigned long nr, return test_and_change_bits(BIT_MASK(nr), addr + BIT_WORD(nr)) != 0; } -#ifdef CONFIG_PPC64 static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *p) { @@ -251,11 +250,8 @@ static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, return (old & BIT_MASK(7)) != 0; } - #define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte -#endif /* CONFIG_PPC64 */ - #include static inline void arch___clear_bit_unlock(int nr, volatile unsigned long *addr) -- cgit v1.2.3 From 2a667285b53c58d72f8bdb736c040f0f36bff58a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:12 +0100 Subject: riscv: implement xor_unlock_is_negative_byte Inspired by the riscv clear_bit_unlock(), this will surely be more efficient than the generic one defined in filemap.c. Link: https://lkml.kernel.org/r/20231004165317.1061855-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/riscv/include/asm/bitops.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 3540b690944b..15e3044298a2 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -191,6 +191,19 @@ static inline void __clear_bit_unlock( clear_bit_unlock(nr, addr); } +static inline bool xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *addr) +{ + unsigned long res; + __asm__ __volatile__ ( + __AMO(xor) ".rl %0, %2, %1" + : "=r" (res), "+A" (*addr) + : "r" (__NOP(mask)) + : "memory"); + return (res & BIT(7)) != 0; +} +#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte + #undef __test_and_op_bit #undef __op_bit #undef __NOP -- cgit v1.2.3 From 12010aa89f8705c8bacddc5c2276cc80badeac56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:13 +0100 Subject: s390: implement arch_xor_unlock_is_negative_byte Inspired by the s390 arch_test_and_clear_bit(), this will surely be more efficient than the generic one defined in filemap.c. Link: https://lkml.kernel.org/r/20231004165317.1061855-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/s390/include/asm/bitops.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 2de74fcd0578..c467dffa8c12 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -201,6 +201,16 @@ static inline void arch___clear_bit_unlock(unsigned long nr, arch___clear_bit(nr, ptr); } +static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, + volatile unsigned long *ptr) +{ + unsigned long old; + + old = __atomic64_xor_barrier(mask, (long *)ptr); + return old & BIT(7); +} +#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte + #include #include #include -- cgit v1.2.3 From f12fb73b74fd23ca33e3f95fb996f295eeae1da7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:14 +0100 Subject: mm: delete checks for xor_unlock_is_negative_byte() Architectures which don't define their own use the one in asm-generic/bitops/lock.h. Get rid of all the ifdefs around "maybe we don't have it". Link: https://lkml.kernel.org/r/20231004165317.1061855-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Geert Uytterhoeven Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- arch/alpha/include/asm/bitops.h | 1 - arch/m68k/include/asm/bitops.h | 1 - arch/mips/include/asm/bitops.h | 1 - arch/riscv/include/asm/bitops.h | 1 - include/asm-generic/bitops/instrumented-lock.h | 5 ----- include/asm-generic/bitops/lock.h | 1 - kernel/kcsan/kcsan_test.c | 3 --- kernel/kcsan/selftest.c | 3 --- mm/filemap.c | 30 +------------------------- mm/kasan/kasan_test.c | 3 --- 10 files changed, 1 insertion(+), 48 deletions(-) diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h index b50ad6b83e85..3e33621922c3 100644 --- a/arch/alpha/include/asm/bitops.h +++ b/arch/alpha/include/asm/bitops.h @@ -305,7 +305,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return (old & BIT(7)) != 0; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte /* * ffz = Find First Zero in word. Undefined if no zero exists, diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h index 80ee36095905..14c64a6f1217 100644 --- a/arch/m68k/include/asm/bitops.h +++ b/arch/m68k/include/asm/bitops.h @@ -339,7 +339,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return result; #endif } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte /* * The true 68020 and more advanced processors support the "bfffo" diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h index d98a05c478f4..89f73d1a4ea4 100644 --- a/arch/mips/include/asm/bitops.h +++ b/arch/mips/include/asm/bitops.h @@ -301,7 +301,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, return res; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #undef __bit_op #undef __test_bit_op diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h index 15e3044298a2..65f6eee4ab8d 100644 --- a/arch/riscv/include/asm/bitops.h +++ b/arch/riscv/include/asm/bitops.h @@ -202,7 +202,6 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, : "memory"); return (res & BIT(7)) != 0; } -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte #undef __test_and_op_bit #undef __op_bit diff --git a/include/asm-generic/bitops/instrumented-lock.h b/include/asm-generic/bitops/instrumented-lock.h index e8ea3aeda9a9..542d3727ee4e 100644 --- a/include/asm-generic/bitops/instrumented-lock.h +++ b/include/asm-generic/bitops/instrumented-lock.h @@ -58,7 +58,6 @@ static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) return arch_test_and_set_bit_lock(nr, addr); } -#if defined(arch_xor_unlock_is_negative_byte) /** * xor_unlock_is_negative_byte - XOR a single byte in memory and test if * it is negative, for unlock. @@ -80,8 +79,4 @@ static inline bool xor_unlock_is_negative_byte(unsigned long mask, instrument_atomic_write(addr, sizeof(long)); return arch_xor_unlock_is_negative_byte(mask, addr); } -/* Let everybody know we have it. */ -#define xor_unlock_is_negative_byte xor_unlock_is_negative_byte -#endif - #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */ diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index 6a638e89d130..14d4ec8c5152 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -75,7 +75,6 @@ static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, old = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p); return !!(old & BIT(7)); } -#define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte #endif #include diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c index 1333d23ac4ef..015586217875 100644 --- a/kernel/kcsan/kcsan_test.c +++ b/kernel/kcsan/kcsan_test.c @@ -699,12 +699,9 @@ static void test_barrier_nothreads(struct kunit *test) KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true); KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false); KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true); - -#ifdef xor_unlock_is_negative_byte KCSAN_EXPECT_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); KCSAN_EXPECT_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); KCSAN_EXPECT_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var), true); -#endif kcsan_nestable_atomic_end(); } diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 619be7417420..84a1200271af 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -227,12 +227,9 @@ static bool __init test_barrier(void) KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock)); spin_lock(&test_spinlock); KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock)); - -#ifdef xor_unlock_is_negative_byte KCSAN_CHECK_RW_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); KCSAN_CHECK_READ_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); KCSAN_CHECK_WRITE_BARRIER(xor_unlock_is_negative_byte(1, &test_var)); -#endif kcsan_nestable_atomic_end(); return ret; diff --git a/mm/filemap.c b/mm/filemap.c index c637863f4643..458377e9a184 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1482,34 +1482,6 @@ void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter) } EXPORT_SYMBOL_GPL(folio_add_wait_queue); -#ifdef xor_unlock_is_negative_byte -#define clear_bit_unlock_is_negative_byte(nr, p) \ - xor_unlock_is_negative_byte(1 << nr, p) -#endif - -#ifndef clear_bit_unlock_is_negative_byte - -/* - * PG_waiters is the high bit in the same byte as PG_lock. - * - * On x86 (and on many other architectures), we can clear PG_lock and - * test the sign bit at the same time. But if the architecture does - * not support that special operation, we just do this all by hand - * instead. - * - * The read of PG_waiters has to be after (or concurrently with) PG_locked - * being cleared, but a memory barrier should be unnecessary since it is - * in the same byte as PG_locked. - */ -static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem) -{ - clear_bit_unlock(nr, mem); - /* smp_mb__after_atomic(); */ - return test_bit(PG_waiters, mem); -} - -#endif - /** * folio_unlock - Unlock a locked folio. * @folio: The folio. @@ -1525,7 +1497,7 @@ void folio_unlock(struct folio *folio) BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0))) + if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0))) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 821c9ea0473a..8281eb42464b 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -1098,12 +1098,9 @@ static void kasan_bitops_test_and_modify(struct kunit *test, int nr, void *addr) KUNIT_EXPECT_KASAN_FAIL(test, test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, __test_and_change_bit(nr, addr)); KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = test_bit(nr, addr)); - -#if defined(xor_unlock_is_negative_byte) if (nr < 7) KUNIT_EXPECT_KASAN_FAIL(test, kasan_int_result = xor_unlock_is_negative_byte(1 << nr, addr)); -#endif } static void kasan_bitops_generic(struct kunit *test) -- cgit v1.2.3 From 0410cd844ed0af3db3cb510d877d62c66d26e5cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:15 +0100 Subject: mm: add folio_xor_flags_has_waiters() Optimise folio_end_read() by setting the uptodate bit at the same time we clear the unlock bit. This saves at least one memory barrier and one write-after-write hazard. Link: https://lkml.kernel.org/r/20231004165317.1061855-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 19 +++++++++++++++++++ mm/filemap.c | 14 +++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c02720c53a5..a88e64acebfe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -692,6 +692,25 @@ TESTPAGEFLAG_FALSE(Ksm, ksm) u64 stable_page_flags(struct page *page); +/** + * folio_xor_flags_has_waiters - Change some folio flags. + * @folio: The folio. + * @mask: Bits set in this word will be changed. + * + * This must only be used for flags which are changed with the folio + * lock held. For example, it is unsafe to use for PG_dirty as that + * can be set without the folio lock held. It can also only be used + * on flags which are in the range 0-6 as some of the implementations + * only affect those bits. + * + * Return: Whether there are tasks waiting on the folio. + */ +static inline bool folio_xor_flags_has_waiters(struct folio *folio, + unsigned long mask) +{ + return xor_unlock_is_negative_byte(mask, folio_flags(folio, 0)); +} + /** * folio_test_uptodate - Is this folio up to date? * @folio: The folio. diff --git a/mm/filemap.c b/mm/filemap.c index 458377e9a184..e9c636f57777 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1497,7 +1497,7 @@ void folio_unlock(struct folio *folio) BUILD_BUG_ON(PG_waiters != 7); BUILD_BUG_ON(PG_locked > 7); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - if (xor_unlock_is_negative_byte(1 << PG_locked, folio_flags(folio, 0))) + if (folio_xor_flags_has_waiters(folio, 1 << PG_locked)) folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_unlock); @@ -1518,9 +1518,17 @@ EXPORT_SYMBOL(folio_unlock); */ void folio_end_read(struct folio *folio, bool success) { + unsigned long mask = 1 << PG_locked; + + /* Must be in bottom byte for x86 to work */ + BUILD_BUG_ON(PG_uptodate > 7); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); + if (likely(success)) - folio_mark_uptodate(folio); - folio_unlock(folio); + mask |= 1 << PG_uptodate; + if (folio_xor_flags_has_waiters(folio, mask)) + folio_wake_bit(folio, PG_locked); } EXPORT_SYMBOL(folio_end_read); -- cgit v1.2.3 From 7d0795d098a127508f3e29ab4257c9ab598efaea Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:16 +0100 Subject: mm: make __end_folio_writeback() return void Rather than check the result of test-and-clear, just check that we have the writeback bit set at the start. This wouldn't catch every case, but it's good enough (and enables the next patch). Link: https://lkml.kernel.org/r/20231004165317.1061855-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++++++-- mm/internal.h | 2 +- mm/page-writeback.c | 38 ++++++++++++++++---------------------- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index e9c636f57777..523d9e15b3b0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1593,9 +1593,15 @@ EXPORT_SYMBOL(folio_wait_private_2_killable); /** * folio_end_writeback - End writeback against a folio. * @folio: The folio. + * + * The folio must actually be under writeback. + * + * Context: May be called from process or interrupt context. */ void folio_end_writeback(struct folio *folio) { + VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio); + /* * folio_test_clear_reclaim() could be used here but it is an * atomic operation and overkill in this particular case. Failing @@ -1615,8 +1621,7 @@ void folio_end_writeback(struct folio *folio) * reused before the folio_wake(). */ folio_get(folio); - if (!__folio_end_writeback(folio)) - BUG(); + __folio_end_writeback(folio); smp_mb__after_atomic(); folio_wake(folio, PG_writeback); diff --git a/mm/internal.h b/mm/internal.h index 2b79da9deb64..ac4bbe2e2ca6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -105,7 +105,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); -bool __folio_end_writeback(struct folio *folio); +void __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 001adbb4a180..a37e4c33d1ab 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2940,11 +2940,10 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) spin_unlock_irqrestore(&wb->work_lock, flags); } -bool __folio_end_writeback(struct folio *folio) +void __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); - bool ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { @@ -2953,19 +2952,16 @@ bool __folio_end_writeback(struct folio *folio) unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - ret = folio_test_clear_writeback(folio); - if (ret) { - __xa_clear_mark(&mapping->i_pages, folio_index(folio), - PAGECACHE_TAG_WRITEBACK); - if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { - struct bdi_writeback *wb = inode_to_wb(inode); - - wb_stat_mod(wb, WB_WRITEBACK, -nr); - __wb_writeout_add(wb, nr); - if (!mapping_tagged(mapping, - PAGECACHE_TAG_WRITEBACK)) - wb_inode_writeback_end(wb); - } + folio_test_clear_writeback(folio); + __xa_clear_mark(&mapping->i_pages, folio_index(folio), + PAGECACHE_TAG_WRITEBACK); + if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { + struct bdi_writeback *wb = inode_to_wb(inode); + + wb_stat_mod(wb, WB_WRITEBACK, -nr); + __wb_writeout_add(wb, nr); + if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) + wb_inode_writeback_end(wb); } if (mapping->host && !mapping_tagged(mapping, @@ -2974,15 +2970,13 @@ bool __folio_end_writeback(struct folio *folio) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - ret = folio_test_clear_writeback(folio); - } - if (ret) { - lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); - zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); - node_stat_mod_folio(folio, NR_WRITTEN, nr); + folio_test_clear_writeback(folio); } + + lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); + zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); + node_stat_mod_folio(folio, NR_WRITTEN, nr); folio_memcg_unlock(folio); - return ret; } bool __folio_start_writeback(struct folio *folio, bool keep_write) -- cgit v1.2.3 From 2580d554585c52a644839864ef9238af5b030ebc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:17 +0100 Subject: mm: use folio_xor_flags_has_waiters() in folio_end_writeback() Match how folio_unlock() works by combining the test for PG_waiters with the clearing of PG_writeback. This should have a small performance win, and removes the last user of folio_wake(). Link: https://lkml.kernel.org/r/20231004165317.1061855-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- mm/filemap.c | 15 +++------------ mm/internal.h | 2 +- mm/page-writeback.c | 9 ++++++--- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 523d9e15b3b0..b04ba896aac9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1175,13 +1175,6 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) spin_unlock_irqrestore(&q->lock, flags); } -static void folio_wake(struct folio *folio, int bit) -{ - if (!folio_test_waiters(folio)) - return; - folio_wake_bit(folio, bit); -} - /* * A choice of three behaviors for folio_wait_bit_common(): */ @@ -1618,13 +1611,11 @@ void folio_end_writeback(struct folio *folio) * Writeback does not hold a folio reference of its own, relying * on truncation to wait for the clearing of PG_writeback. * But here we must make sure that the folio is not freed and - * reused before the folio_wake(). + * reused before the folio_wake_bit(). */ folio_get(folio); - __folio_end_writeback(folio); - - smp_mb__after_atomic(); - folio_wake(folio, PG_writeback); + if (__folio_end_writeback(folio)) + folio_wake_bit(folio, PG_writeback); acct_reclaim_writeback(folio); folio_put(folio); } diff --git a/mm/internal.h b/mm/internal.h index ac4bbe2e2ca6..2b79da9deb64 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -105,7 +105,7 @@ static inline void wake_throttle_isolated(pg_data_t *pgdat) vm_fault_t do_swap_page(struct vm_fault *vmf); void folio_rotate_reclaimable(struct folio *folio); -void __folio_end_writeback(struct folio *folio); +bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a37e4c33d1ab..46f2f5d3d183 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2940,10 +2940,11 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb) spin_unlock_irqrestore(&wb->work_lock, flags); } -void __folio_end_writeback(struct folio *folio) +bool __folio_end_writeback(struct folio *folio) { long nr = folio_nr_pages(folio); struct address_space *mapping = folio_mapping(folio); + bool ret; folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { @@ -2952,7 +2953,7 @@ void __folio_end_writeback(struct folio *folio) unsigned long flags; xa_lock_irqsave(&mapping->i_pages, flags); - folio_test_clear_writeback(folio); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); __xa_clear_mark(&mapping->i_pages, folio_index(folio), PAGECACHE_TAG_WRITEBACK); if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) { @@ -2970,13 +2971,15 @@ void __folio_end_writeback(struct folio *folio) xa_unlock_irqrestore(&mapping->i_pages, flags); } else { - folio_test_clear_writeback(folio); + ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback); } lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); folio_memcg_unlock(folio); + + return ret; } bool __folio_start_writeback(struct folio *folio, bool keep_write) -- cgit v1.2.3 From 59838b2566f6d03099743675a2e0f425813078c6 Mon Sep 17 00:00:00 2001 From: Frank van der Linden Date: Wed, 4 Oct 2023 15:32:48 +0000 Subject: mm, hugetlb: remove HUGETLB_CGROUP_MIN_ORDER Originally, hugetlb_cgroup was the only hugetlb user of tail page structure fields. So, the code defined and checked against HUGETLB_CGROUP_MIN_ORDER to make sure pages weren't too small to use. However, by now, tail page #2 is used to store hugetlb hwpoison and subpool information as well. In other words, without that tail page hugetlb doesn't work. Acknowledge this fact by getting rid of HUGETLB_CGROUP_MIN_ORDER and checks against it. Instead, just check for the minimum viable page order at hstate creation time. Link: https://lkml.kernel.org/r/20231004153248.3842997-1-fvdl@google.com Signed-off-by: Frank van der Linden Reviewed-by: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb_cgroup.h | 11 ----------- mm/hugetlb.c | 2 +- mm/hugetlb_cgroup.c | 20 ++------------------ 3 files changed, 3 insertions(+), 30 deletions(-) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 3d82d91f49ac..e5d64b8b59c2 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -22,13 +22,6 @@ struct resv_map; struct file_region; #ifdef CONFIG_CGROUP_HUGETLB -/* - * Minimum page order trackable by hugetlb cgroup. - * At least 3 pages are necessary for all the tracking information. - * The second tail page contains all of the hugetlb-specific fields. - */ -#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__NR_USED_SUBPAGE) - enum hugetlb_memory_event { HUGETLB_MAX, HUGETLB_NR_MEMORY_EVENTS, @@ -68,8 +61,6 @@ static inline struct hugetlb_cgroup * __hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) - return NULL; if (rsvd) return folio->_hugetlb_cgroup_rsvd; else @@ -91,8 +82,6 @@ static inline void __set_hugetlb_cgroup(struct folio *folio, struct hugetlb_cgroup *h_cg, bool rsvd) { VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio); - if (folio_order(folio) < HUGETLB_CGROUP_MIN_ORDER) - return; if (rsvd) folio->_hugetlb_cgroup_rsvd = h_cg; else diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7ad9d2159da4..e2b1c417b90a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4361,7 +4361,7 @@ void __init hugetlb_add_hstate(unsigned int order) return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); + BUG_ON(order < order_base_2(__NR_USED_SUBPAGE)); h = &hstates[hugetlb_max_hstate++]; mutex_init(&h->resize_lock); h->order = order; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index dedd2edb076e..aa4486bd3904 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -262,12 +262,6 @@ static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled()) goto done; - /* - * We don't charge any cgroup if the compound page have less - * than 3 pages. - */ - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); @@ -397,9 +391,6 @@ static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) - return; - page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); @@ -869,15 +860,8 @@ void __init hugetlb_cgroup_file_init(void) { struct hstate *h; - for_each_hstate(h) { - /* - * Add cgroup control files only if the huge page consists - * of more than two normal pages. This is because we use - * page[2].private for storing cgroup details. - */ - if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) - __hugetlb_cgroup_file_init(hstate_index(h)); - } + for_each_hstate(h) + __hugetlb_cgroup_file_init(hstate_index(h)); } /* -- cgit v1.2.3 From 4b569387c0d566db288e7c3e1b484b43df797bdb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:26 -0700 Subject: memcontrol: add helpers for hugetlb memcg accounting Patch series "hugetlb memcg accounting", v4. Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etcetera fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch series rectifies this issue by charging the memcg when the hugetlb folio is allocated, and uncharging when the folio is freed. In addition, a new selftest is added to demonstrate and verify this new behavior. This patch (of 4): This patch exposes charge committing and cancelling as parts of the memory controller interface. These functionalities are useful when the try_charge() and commit_charge() stages have to be separated by other actions in between (which can fail). One such example is the new hugetlb accounting behavior in the following patch. The patch also adds a helper function to obtain a reference to the current task's memcg. Link: https://lkml.kernel.org/r/20231006184629.155543-1-nphamcs@gmail.com Link: https://lkml.kernel.org/r/20231006184629.155543-2-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 +++++++++++++++++ mm/memcontrol.c | 59 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c6029aeaa268..bea3d997b076 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -652,6 +652,8 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, page_counter_read(&memcg->memory); } +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg); + int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp); /** @@ -703,6 +705,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) __mem_cgroup_uncharge_list(page_list); } +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); + void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -759,6 +763,8 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); +struct mem_cgroup *get_mem_cgroup_from_current(void); + struct lruvec *folio_lruvec_lock(struct folio *folio); struct lruvec *folio_lruvec_lock_irq(struct folio *folio); struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio, @@ -1239,6 +1245,11 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *target, return false; } +static inline void mem_cgroup_commit_charge(struct folio *folio, + struct mem_cgroup *memcg) +{ +} + static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) { @@ -1263,6 +1274,11 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } +static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, + unsigned int nr_pages) +{ +} + static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } @@ -1300,6 +1316,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + return NULL; +} + static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8539f2037168..b3e40a59f732 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1099,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void) return false; } +/** + * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. + */ +struct mem_cgroup *get_mem_cgroup_from_current(void) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return NULL; + +again: + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + if (!css_tryget(&memcg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + return memcg; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -2873,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return try_charge_memcg(memcg, gfp_mask, nr_pages); } -static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) +/** + * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call. + * @memcg: memcg previously charged. + * @nr_pages: number of pages previously charged. + */ +void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (mem_cgroup_is_root(memcg)) return; @@ -2898,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } +/** + * mem_cgroup_commit_charge - commit a previously successful try_charge(). + * @folio: folio to commit the charge to. + * @memcg: memcg previously charged. + */ +void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg) +{ + css_get(&memcg->css); + commit_charge(folio, memcg); + + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio)); + memcg_check_events(memcg, folio_nid(folio)); + local_irq_enable(); +} + #ifdef CONFIG_MEMCG_KMEM /* * The allocated objcg pointers array is not accounted directly. @@ -6116,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - cancel_charge(mc.to, mc.precharge); + mem_cgroup_cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6124,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - cancel_charge(mc.from, mc.moved_charge); + mem_cgroup_cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ @@ -7031,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg, gfp_t gfp) { - long nr_pages = folio_nr_pages(folio); int ret; - ret = try_charge(memcg, gfp, nr_pages); + ret = try_charge(memcg, gfp, folio_nr_pages(folio)); if (ret) goto out; - css_get(&memcg->css); - commit_charge(folio, memcg); - - local_irq_disable(); - mem_cgroup_charge_statistics(memcg, nr_pages); - memcg_check_events(memcg, folio_nid(folio)); - local_irq_enable(); + mem_cgroup_commit_charge(folio, memcg); out: return ret; } -- cgit v1.2.3 From 85ce2c517ade0d51b7ad95f2e88be9bbe294379a Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:27 -0700 Subject: memcontrol: only transfer the memcg data for migration For most migration use cases, only transfer the memcg data from the old folio to the new folio, and clear the old folio's memcg data. No charging and uncharging will be done. This shaves off some work on the migration path, and avoids the temporary double charging of a folio during its migration. The only exception is replace_page_cache_folio(), which will use the old mem_cgroup_migrate() (now renamed to mem_cgroup_replace_folio). In that context, the isolation of the old page isn't quite as thorough as with migration, so we cannot use our new implementation directly. This patch is the result of the following discussion on the new hugetlb memcg accounting behavior: https://lore.kernel.org/lkml/20231003171329.GB314430@monkey/ Link: https://lkml.kernel.org/r/20231006184629.155543-3-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 +++++++ mm/filemap.c | 2 +- mm/memcontrol.c | 40 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 45 insertions(+), 4 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bea3d997b076..a3fce570122e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -707,6 +707,8 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages); +void mem_cgroup_replace_folio(struct folio *old, struct folio *new); + void mem_cgroup_migrate(struct folio *old, struct folio *new); /** @@ -1279,6 +1281,11 @@ static inline void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, { } +static inline void mem_cgroup_replace_folio(struct folio *old, + struct folio *new) +{ +} + static inline void mem_cgroup_migrate(struct folio *old, struct folio *new) { } diff --git a/mm/filemap.c b/mm/filemap.c index b04ba896aac9..48cd16c54e86 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -816,7 +816,7 @@ void replace_page_cache_folio(struct folio *old, struct folio *new) new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(old, new); + mem_cgroup_replace_folio(old, new); xas_lock_irq(&xas); xas_store(&xas, new); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b3e40a59f732..135d6637afe5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7292,16 +7292,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_migrate - Charge a folio's replacement. + * mem_cgroup_replace_folio - Charge a folio's replacement. * @old: Currently circulating folio. * @new: Replacement folio. * * Charge @new as a replacement folio for @old. @old will - * be uncharged upon free. + * be uncharged upon free. This is only used by the page cache + * (in replace_page_cache_folio()). * * Both folios must be locked, @new->mapping must be set up. */ -void mem_cgroup_migrate(struct folio *old, struct folio *new) +void mem_cgroup_replace_folio(struct folio *old, struct folio *new) { struct mem_cgroup *memcg; long nr_pages = folio_nr_pages(new); @@ -7340,6 +7341,39 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) local_irq_restore(flags); } +/** + * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio. + * @old: Currently circulating folio. + * @new: Replacement folio. + * + * Transfer the memcg data from the old folio to the new folio for migration. + * The old folio's data info will be cleared. Note that the memory counters + * will remain unchanged throughout the process. + * + * Both folios must be locked, @new->mapping must be set up. + */ +void mem_cgroup_migrate(struct folio *old, struct folio *new) +{ + struct mem_cgroup *memcg; + + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new); + VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new); + + if (mem_cgroup_disabled()) + return; + + memcg = folio_memcg(old); + VM_WARN_ON_ONCE_FOLIO(!memcg, old); + if (!memcg) + return; + + /* Transfer the charge and the css ref */ + commit_charge(new, memcg); + old->memcg_data = 0; +} + DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); EXPORT_SYMBOL(memcg_sockets_enabled_key); -- cgit v1.2.3 From 8cba9576df601c384abd334a503c3f6e1e29eefb Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:28 -0700 Subject: hugetlb: memcg: account hugetlb-backed memory in memory controller Currently, hugetlb memory usage is not acounted for in the memory controller, which could lead to memory overprotection for cgroups with hugetlb-backed memory. This has been observed in our production system. For instance, here is one of our usecases: suppose there are two 32G containers. The machine is booted with hugetlb_cma=6G, and each container may or may not use up to 3 gigantic page, depending on the workload within it. The rest is anon, cache, slab, etc. We can set the hugetlb cgroup limit of each cgroup to 3G to enforce hugetlb fairness. But it is very difficult to configure memory.max to keep overall consumption, including anon, cache, slab etc. fair. What we have had to resort to is to constantly poll hugetlb usage and readjust memory.max. Similar procedure is done to other memory limits (memory.low for e.g). However, this is rather cumbersome and buggy. Furthermore, when there is a delay in memory limits correction, (for e.g when hugetlb usage changes within consecutive runs of the userspace agent), the system could be in an over/underprotected state. This patch rectifies this issue by charging the memcg when the hugetlb folio is utilized, and uncharging when the folio is freed (analogous to the hugetlb controller). Note that we do not charge when the folio is allocated to the hugetlb pool, because at this point it is not owned by any memcg. Some caveats to consider: * This feature is only available on cgroup v2. * There is no hugetlb pool management involved in the memory controller. As stated above, hugetlb folios are only charged towards the memory controller when it is used. Host overcommit management has to consider it when configuring hard limits. * Failure to charge towards the memcg results in SIGBUS. This could happen even if the hugetlb pool still has pages (but the cgroup limit is hit and reclaim attempt fails). * When this feature is enabled, hugetlb pages contribute to memory reclaim protection. low, min limits tuning must take into account hugetlb memory. * Hugetlb pages utilized while this option is not selected will not be tracked by the memory controller (even if cgroup v2 is remounted later on). Link: https://lkml.kernel.org/r/20231006184629.155543-4-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Johannes Weiner Cc: Frank van der Linden Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 29 +++++++++++++++++++++++ include/linux/cgroup-defs.h | 5 ++++ include/linux/memcontrol.h | 9 +++++++ kernel/cgroup/cgroup.c | 15 +++++++++++- mm/hugetlb.c | 35 +++++++++++++++++++++------ mm/memcontrol.c | 42 ++++++++++++++++++++++++++++++++- mm/migrate.c | 3 +-- 7 files changed, 127 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 622a7f28db1f..606b2e0eac4b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -210,6 +210,35 @@ cgroup v2 currently supports the following mount options. relying on the original semantics (e.g. specifying bogusly high 'bypass' protection values at higher tree levels). + memory_hugetlb_accounting + Count HugeTLB memory usage towards the cgroup's overall + memory usage for the memory controller (for the purpose of + statistics reporting and memory protetion). This is a new + behavior that could regress existing setups, so it must be + explicitly opted in with this mount option. + + A few caveats to keep in mind: + + * There is no HugeTLB pool management involved in the memory + controller. The pre-allocated pool does not belong to anyone. + Specifically, when a new HugeTLB folio is allocated to + the pool, it is not accounted for from the perspective of the + memory controller. It is only charged to a cgroup when it is + actually used (for e.g at page fault time). Host memory + overcommit management has to consider this when configuring + hard limits. In general, HugeTLB pool management should be + done via other mechanisms (such as the HugeTLB controller). + * Failure to charge a HugeTLB folio to the memory controller + results in SIGBUS. This could happen even if the HugeTLB pool + still has pages available (but the cgroup limit is hit and + reclaim attempt fails). + * Charging HugeTLB memory towards the memory controller affects + memory protection and reclaim dynamics. Any userspace tuning + (of low, min limits for e.g) needs to take this into account. + * HugeTLB pages utilized while this option is not selected + will not be tracked by the memory controller (even if cgroup + v2 is remounted later on). + Organizing Processes and Threads -------------------------------- diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index f1b3151ac30b..8641f4320c98 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -115,6 +115,11 @@ enum { * Enable recursive subtree protection */ CGRP_ROOT_MEMORY_RECURSIVE_PROT = (1 << 18), + + /* + * Enable hugetlb accounting for the memory controller. + */ + CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING = (1 << 19), }; /* cftype->flags */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a3fce570122e..6674c12725d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -678,6 +678,9 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages); + int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1258,6 +1261,12 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } +static inline int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, + gfp_t gfp, long nr_pages) +{ + return 0; +} + static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1fb7f562289d..f11488b18ceb 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1902,6 +1902,7 @@ enum cgroup2_param { Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, + Opt_memory_hugetlb_accounting, nr__cgroup2_params }; @@ -1910,6 +1911,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), + fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting), {} }; @@ -1936,6 +1938,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_memory_recursiveprot: ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; return 0; + case Opt_memory_hugetlb_accounting: + ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + return 0; } return -EINVAL; } @@ -1960,6 +1965,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT; else cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT; + + if (root_flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING; } } @@ -1973,6 +1983,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) seq_puts(seq, ",memory_recursiveprot"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING) + seq_puts(seq, ",memory_hugetlb_accounting"); return 0; } @@ -7050,7 +7062,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, "nsdelegate\n" "favordynmods\n" "memory_localevents\n" - "memory_recursiveprot\n"); + "memory_recursiveprot\n" + "memory_hugetlb_accounting\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e2b1c417b90a..da6f85b7db88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1927,6 +1927,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3026,11 +3027,20 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct folio *folio; - long map_chg, map_commit; + long map_chg, map_commit, nr_pages = pages_per_huge_page(h); long gbl_chg; - int ret, idx; + int memcg_charge_ret, ret, idx; struct hugetlb_cgroup *h_cg = NULL; + struct mem_cgroup *memcg; bool deferred_reserve; + gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL; + + memcg = get_mem_cgroup_from_current(); + memcg_charge_ret = mem_cgroup_hugetlb_try_charge(memcg, gfp, nr_pages); + if (memcg_charge_ret == -ENOMEM) { + mem_cgroup_put(memcg); + return ERR_PTR(-ENOMEM); + } idx = hstate_index(h); /* @@ -3039,8 +3049,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, * code of zero indicates a reservation exists (no change). */ map_chg = gbl_chg = vma_needs_reservation(h, vma, addr); - if (map_chg < 0) + if (map_chg < 0) { + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOMEM); + } /* * Processes that did not create the mapping will have no @@ -3051,10 +3065,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, */ if (map_chg || avoid_reserve) { gbl_chg = hugepage_subpool_get_pages(spool, 1); - if (gbl_chg < 0) { - vma_end_reservation(h, vma, addr); - return ERR_PTR(-ENOSPC); - } + if (gbl_chg < 0) + goto out_end_reservation; /* * Even though there was no reservation in the region/reserve @@ -3136,6 +3148,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); } + + if (!memcg_charge_ret) + mem_cgroup_commit_charge(folio, memcg); + mem_cgroup_put(memcg); + return folio; out_uncharge_cgroup: @@ -3147,7 +3164,11 @@ out_uncharge_cgroup_reservation: out_subpool_put: if (map_chg || avoid_reserve) hugepage_subpool_put_pages(spool, 1); +out_end_reservation: vma_end_reservation(h, vma, addr); + if (!memcg_charge_ret) + mem_cgroup_cancel_charge(memcg, nr_pages); + mem_cgroup_put(memcg); return ERR_PTR(-ENOSPC); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 135d6637afe5..a86e7b445800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7096,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) return ret; } +/** + * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio + * @memcg: memcg to charge. + * @gfp: reclaim mode. + * @nr_pages: number of pages to charge. + * + * This function is called when allocating a huge page folio to determine if + * the memcg has the capacity for it. It does not commit the charge yet, + * as the hugetlb folio itself has not been obtained from the hugetlb pool. + * + * Once we have obtained the hugetlb folio, we can call + * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the + * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect + * of try_charge(). + * + * Returns 0 on success. Otherwise, an error code is returned. + */ +int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp, + long nr_pages) +{ + /* + * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation, + * but do not attempt to commit charge later (or cancel on error) either. + */ + if (mem_cgroup_disabled() || !memcg || + !cgroup_subsys_on_dfl(memory_cgrp_subsys) || + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + return -EOPNOTSUPP; + + if (try_charge(memcg, gfp, nr_pages)) + return -ENOMEM; + + return 0; +} + /** * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. * @folio: folio to charge. @@ -7365,7 +7400,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new) return; memcg = folio_memcg(old); - VM_WARN_ON_ONCE_FOLIO(!memcg, old); + /* + * Note that it is normal to see !memcg for a hugetlb folio. + * For e.g, itt could have been allocated when memory_hugetlb_accounting + * was not selected. + */ + VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old); if (!memcg) return; diff --git a/mm/migrate.c b/mm/migrate.c index 9d9eee5322d1..c602bf6dec97 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -633,8 +633,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) folio_copy_owner(newfolio, folio); - if (!folio_test_hugetlb(folio)) - mem_cgroup_migrate(folio, newfolio); + mem_cgroup_migrate(folio, newfolio); } EXPORT_SYMBOL(folio_migrate_flags); -- cgit v1.2.3 From c0dddb7aa5f85e6150a1e3230e01d5ba286eded9 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 6 Oct 2023 11:46:29 -0700 Subject: selftests: add a selftest to verify hugetlb usage in memcg This patch add a new kselftest to demonstrate and verify the new hugetlb memcg accounting behavior. Link: https://lkml.kernel.org/r/20231006184629.155543-5-nphamcs@gmail.com Signed-off-by: Nhat Pham Cc: Frank van der Linden Cc: Johannes Weiner Cc: Michal Hocko Cc: Mike Kravetz Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun heo Cc: Yosry Ahmed Cc: Zefan Li Signed-off-by: Andrew Morton --- MAINTAINERS | 2 + tools/testing/selftests/cgroup/.gitignore | 1 + tools/testing/selftests/cgroup/Makefile | 2 + .../testing/selftests/cgroup/test_hugetlb_memcg.c | 234 +++++++++++++++++++++ 4 files changed, 239 insertions(+) create mode 100644 tools/testing/selftests/cgroup/test_hugetlb_memcg.c diff --git a/MAINTAINERS b/MAINTAINERS index 68ff5b62a1d2..7fd72948d10e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5269,6 +5269,7 @@ S: Maintained F: mm/memcontrol.c F: mm/swap_cgroup.c F: tools/testing/selftests/cgroup/memcg_protection.m +F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c F: tools/testing/selftests/cgroup/test_kmem.c F: tools/testing/selftests/cgroup/test_memcontrol.c @@ -9652,6 +9653,7 @@ F: include/linux/hugetlb.h F: mm/hugetlb.c F: mm/hugetlb_vmemmap.c F: mm/hugetlb_vmemmap.h +F: tools/testing/selftests/cgroup/test_hugetlb_memcg.c HVA ST MEDIA DRIVER M: Jean-Christophe Trotin diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index af8c3f30b9c1..2732e0b29271 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -7,4 +7,5 @@ test_kill test_cpu test_cpuset test_zswap +test_hugetlb_memcg wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index c27f05f6ce9b..00b441928909 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -14,6 +14,7 @@ TEST_GEN_PROGS += test_kill TEST_GEN_PROGS += test_cpu TEST_GEN_PROGS += test_cpuset TEST_GEN_PROGS += test_zswap +TEST_GEN_PROGS += test_hugetlb_memcg LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h @@ -27,3 +28,4 @@ $(OUTPUT)/test_kill: cgroup_util.c $(OUTPUT)/test_cpu: cgroup_util.c $(OUTPUT)/test_cpuset: cgroup_util.c $(OUTPUT)/test_zswap: cgroup_util.c +$(OUTPUT)/test_hugetlb_memcg: cgroup_util.c diff --git a/tools/testing/selftests/cgroup/test_hugetlb_memcg.c b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c new file mode 100644 index 000000000000..f0fefeb4cc24 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_hugetlb_memcg.c @@ -0,0 +1,234 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include "../kselftest.h" +#include "cgroup_util.h" + +#define ADDR ((void *)(0x0UL)) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +/* mapping 8 MBs == 4 hugepages */ +#define LENGTH (8UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +/* borrowed from mm/hmm-tests.c */ +static long get_hugepage_size(void) +{ + int fd; + char buf[2048]; + int len; + char *p, *q, *path = "/proc/meminfo", *tag = "Hugepagesize:"; + long val; + + fd = open(path, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +static int set_file(const char *path, long value) +{ + FILE *file; + int ret; + + file = fopen(path, "w"); + if (!file) + return -1; + ret = fprintf(file, "%ld\n", value); + fclose(file); + return ret; +} + +static int set_nr_hugepages(long value) +{ + return set_file("/proc/sys/vm/nr_hugepages", value); +} + +static unsigned int check_first(char *addr) +{ + return *(unsigned int *)addr; +} + +static void write_data(char *addr) +{ + unsigned long i; + + for (i = 0; i < LENGTH; i++) + *(addr + i) = (char)i; +} + +static int hugetlb_test_program(const char *cgroup, void *arg) +{ + char *test_group = (char *)arg; + void *addr; + long old_current, expected_current, current; + int ret = EXIT_FAILURE; + + old_current = cg_read_long(test_group, "memory.current"); + set_nr_hugepages(20); + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg( + "setting nr_hugepages should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + return EXIT_FAILURE; + } + + addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, 0, 0); + if (addr == MAP_FAILED) { + ksft_print_msg("fail to mmap.\n"); + return EXIT_FAILURE; + } + current = cg_read_long(test_group, "memory.current"); + if (current - old_current >= MB(2)) { + ksft_print_msg("mmap should not increase hugepage usage.\n"); + ksft_print_msg("before: %ld, after: %ld\n", old_current, current); + goto out_failed_munmap; + } + old_current = current; + + /* read the first page */ + check_first(addr); + expected_current = old_current + MB(2); + current = cg_read_long(test_group, "memory.current"); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 2MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* write to the whole range */ + write_data(addr); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current + MB(8); + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should increase by around 8MB.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + goto out_failed_munmap; + } + + /* unmap the whole range */ + munmap(addr, LENGTH); + current = cg_read_long(test_group, "memory.current"); + expected_current = old_current; + if (!values_close(expected_current, current, 5)) { + ksft_print_msg("memory usage should go back down.\n"); + ksft_print_msg( + "expected memory: %ld, actual memory: %ld\n", + expected_current, current); + return ret; + } + + ret = EXIT_SUCCESS; + return ret; + +out_failed_munmap: + munmap(addr, LENGTH); + return ret; +} + +static int test_hugetlb_memcg(char *root) +{ + int ret = KSFT_FAIL; + char *test_group; + + test_group = cg_name(root, "hugetlb_memcg_test"); + if (!test_group || cg_create(test_group)) { + ksft_print_msg("fail to create cgroup.\n"); + goto out; + } + + if (cg_write(test_group, "memory.max", "100M")) { + ksft_print_msg("fail to set cgroup memory limit.\n"); + goto out; + } + + /* disable swap */ + if (cg_write(test_group, "memory.swap.max", "0")) { + ksft_print_msg("fail to disable swap.\n"); + goto out; + } + + if (!cg_run(test_group, hugetlb_test_program, (void *)test_group)) + ret = KSFT_PASS; +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + int ret = EXIT_SUCCESS, has_memory_hugetlb_acc; + + has_memory_hugetlb_acc = proc_mount_contains("memory_hugetlb_accounting"); + if (has_memory_hugetlb_acc < 0) + ksft_exit_skip("Failed to query cgroup mount option\n"); + else if (!has_memory_hugetlb_acc) + ksft_exit_skip("memory hugetlb accounting is disabled\n"); + + /* Unit is kB! */ + if (get_hugepage_size() != 2048) { + ksft_print_msg("test_hugetlb_memcg requires 2MB hugepages\n"); + ksft_test_result_skip("test_hugetlb_memcg\n"); + return ret; + } + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + switch (test_hugetlb_memcg(root)) { + case KSFT_PASS: + ksft_test_result_pass("test_hugetlb_memcg\n"); + break; + case KSFT_SKIP: + ksft_test_result_skip("test_hugetlb_memcg\n"); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("test_hugetlb_memcg\n"); + break; + } + + return ret; +} -- cgit v1.2.3 From 7a81751fcdeb833acc858e59082688e3020bfe12 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Mon, 25 Sep 2023 13:01:10 -0700 Subject: mm/thp: fix "mm: thp: kill __transhuge_page_enabled()" The 6.0 commits: commit 9fec51689ff6 ("mm: thp: kill transparent_hugepage_active()") commit 7da4e2cb8b1f ("mm: thp: kill __transhuge_page_enabled()") merged "can we have THPs in this VMA?" logic that was previously done separately by fault-path, khugepaged, and smaps "THPeligible" checks. During the process, the semantics of the fault path check changed in two ways: 1) A VM_NO_KHUGEPAGED check was introduced (also added to smaps path). 2) We no longer checked if non-anonymous memory had a vm_ops->huge_fault handler that could satisfy the fault. Previously, this check had been done in create_huge_pud() and create_huge_pmd() routines, but after the changes, we never reach those routines. During the review of the above commits, it was determined that in-tree users weren't affected by the change; most notably, since the only relevant user (in terms of THP) of VM_MIXEDMAP or ->huge_fault is DAX, which is explicitly approved early in approval logic. However, this was a bad assumption to make as it assumes the only reason to support ->huge_fault was for DAX (which is not true in general). Remove the VM_NO_KHUGEPAGED check when not in collapse path and give any ->huge_fault handler a chance to handle the fault. Note that we don't validate the file mode or mapping alignment, which is consistent with the behavior before the aforementioned commits. Link: https://lkml.kernel.org/r/20230925200110.1979606-1-zokeefe@google.com Fixes: 7da4e2cb8b1f ("mm: thp: kill __transhuge_page_enabled()") Reported-by: Saurabh Singh Sengar Signed-off-by: Zach O'Keefe Cc: Yang Shi Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 51a66eb48938..c9cbcbf6697e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -100,11 +100,11 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, return in_pf; /* - * Special VMA and hugetlb VMA. + * khugepaged special VMA and hugetlb VMA. * Must be checked after dax since some dax mappings may have * VM_MIXEDMAP set. */ - if (vm_flags & VM_NO_KHUGEPAGED) + if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED)) return false; /* @@ -132,12 +132,18 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, !hugepage_flags_always()))) return false; - /* Only regular file is valid */ - if (!in_pf && file_thp_enabled(vma)) - return true; - - if (!vma_is_anonymous(vma)) + if (!vma_is_anonymous(vma)) { + /* + * Trust that ->huge_fault() handlers know what they are doing + * in fault path. + */ + if (((in_pf || smaps)) && vma->vm_ops->huge_fault) + return true; + /* Only regular file is valid in collapse path */ + if (((!in_pf || smaps)) && file_thp_enabled(vma)) + return true; return false; + } if (vma_is_temporary_stack(vma)) return false; -- cgit v1.2.3 From f04eba134e598d4a5af2eeecff0562df5042cbfc Mon Sep 17 00:00:00 2001 From: Lucy Mielke Date: Fri, 6 Oct 2023 22:30:51 +0200 Subject: mm: add printf attribute to shrinker_debugfs_name_alloc This fixes a compiler warning when compiling an allyesconfig with W=1: mm/internal.h:1235:9: error: function might be a candidate for `gnu_printf' format attribute [-Werror=suggest-attribute=format] [akpm@linux-foundation.org: fix shrinker_alloc() as welll per Qi Zheng] Link: https://lkml.kernel.org/r/822387b7-4895-4e64-5806-0f56b5d6c447@bytedance.com Link: https://lkml.kernel.org/r/ZSBue-3kM6gI6jCr@mainframe Fixes: c42d50aefd17 ("mm: shrinker: add infrastructure for dynamically allocating shrinker") Signed-off-by: Lucy Mielke Cc: Qi Zheng Signed-off-by: Andrew Morton --- include/linux/shrinker.h | 1 + mm/internal.h | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index e4f93120e0ab..1a00be90d93a 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -131,6 +131,7 @@ struct shrinker { */ #define SHRINKER_NONSLAB BIT(4) +__printf(2, 3) struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...); void shrinker_register(struct shrinker *shrinker); void shrinker_free(struct shrinker *shrinker); diff --git a/mm/internal.h b/mm/internal.h index 2b79da9deb64..d1f7a3c612fe 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1219,8 +1219,8 @@ unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority); #ifdef CONFIG_SHRINKER_DEBUG -static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker, - const char *fmt, va_list ap) +static inline __printf(2, 0) int shrinker_debugfs_name_alloc( + struct shrinker *shrinker, const char *fmt, va_list ap) { shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap); -- cgit v1.2.3 From 9b91432985851e5d55a41478bc0ecf93bf746981 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 10 Oct 2023 08:25:02 +0100 Subject: mm/mprotect: allow unfaulted VMAs to be unaccounted on mprotect() When mprotect() is used to make unwritable VMAs writable, they have the VM_ACCOUNT flag applied and memory accounted accordingly. If the VMA has had no pages faulted in and is then made unwritable once again, it will remain accounted for, despite not being capable of extending memory usage. Consider:- ptr = mmap(NULL, page_size * 3, PROT_READ, MAP_ANON | MAP_PRIVATE, -1, 0); mprotect(ptr + page_size, page_size, PROT_READ | PROT_WRITE); mprotect(ptr + page_size, page_size, PROT_READ); The first mprotect() splits the range into 3 VMAs and the second fails to merge the three as the middle VMA has VM_ACCOUNT set and the others do not, rendering them unmergeable. This is unnecessary, since no pages have actually been allocated and the middle VMA is not capable of utilising more memory, thereby introducing unnecessary VMA fragmentation (and accounting for more memory than is necessary). Since we cannot efficiently determine which pages map to an anonymous VMA, we have to be very conservative - determining whether any pages at all have been faulted in, by checking whether vma->anon_vma is NULL. We can see that the lack of anon_vma implies that no anonymous pages are present as evidenced by vma_needs_copy() utilising this on fork to determine whether page tables need to be copied. The only place where anon_vma is set NULL explicitly is on fork with VM_WIPEONFORK set, however since this flag is intended to cause the child process to not CoW on a given memory range, it is right to interpret this as indicating the VMA has no faulted-in anonymous memory mapped. If the VMA was forked without VM_WIPEONFORK set, then anon_vma_fork() will have ensured that a new anon_vma is assigned (and correctly related to its parent anon_vma) should any pages be CoW-mapped. The overall operation is safe against races as we hold a write lock against mm->mmap_lock. If we could efficiently look up the VMA's faulted-in pages then we would unaccount all those pages not yet faulted in. However as the original comment alludes this simply isn't currently possible, so we are conservative and account all pages or none at all. Link: https://lkml.kernel.org/r/ad5540371a16623a069f03f4db1739f33cde1fab.1696921767.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Mike Rapoport (IBM) Cc: David Hildenbrand Cc: Liam R. Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mprotect.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index b94fbb45d5c7..03e2cec3e669 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -608,8 +608,11 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. hugetlb mapping were accounted for - * even if read-only so there is no need to account for them here + * make it unwritable again except in the anonymous case where no + * anon_vma has yet to be assigned. + * + * hugetlb mapping were accounted for even if read-only so there is + * no need to account for them here. */ if (newflags & VM_WRITE) { /* Check space limits when area turns into data. */ @@ -623,6 +626,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, return -ENOMEM; newflags |= VM_ACCOUNT; } + } else if ((oldflags & VM_ACCOUNT) && vma_is_anonymous(vma) && + !vma->anon_vma) { + newflags &= ~VM_ACCOUNT; } /* @@ -665,6 +671,9 @@ success: change_protection(tlb, vma, start, end, mm_cp_flags); + if ((oldflags & VM_ACCOUNT) && !(newflags & VM_ACCOUNT)) + vm_unacct_memory(nrpages); + /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. -- cgit v1.2.3 From b0b598ee08f9759b70971e297cf7ddd3eaaa5245 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 04:58:28 +0100 Subject: filemap: remove use of wait bookmarks The original problem of the overly long list of waiters on a locked page was solved properly by commit 9a1ea439b16b ("mm: put_and_wait_on_page_locked() while page is migrated"). In the meantime, using bookmarks for the writeback bit can cause livelocks, so we need to stop using them. Link: https://lkml.kernel.org/r/20231010035829.544242-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Bin Lai Cc: Benjamin Segall Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Valentin Schneider Cc: Vincent Guittot Cc: Ingo Molnar Signed-off-by: Andrew Morton --- mm/filemap.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 48cd16c54e86..9ef49255f1a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1133,32 +1133,13 @@ static void folio_wake_bit(struct folio *folio, int bit_nr) wait_queue_head_t *q = folio_waitqueue(folio); struct wait_page_key key; unsigned long flags; - wait_queue_entry_t bookmark; key.folio = folio; key.bit_nr = bit_nr; key.page_match = 0; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); - spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); - - while (bookmark.flags & WQ_FLAG_BOOKMARK) { - /* - * Take a breather from holding the lock, - * allow pages that finish wake up asynchronously - * to acquire the lock and remove themselves - * from wait queue - */ - spin_unlock_irqrestore(&q->lock, flags); - cpu_relax(); - spin_lock_irqsave(&q->lock, flags); - __wake_up_locked_key_bookmark(q, TASK_NORMAL, &key, &bookmark); - } + __wake_up_locked_key(q, TASK_NORMAL, &key); /* * It's possible to miss clearing waiters here, when we woke our page -- cgit v1.2.3 From 37acade0ce8938f00d6979bd02b8043b5b7089ae Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 10 Oct 2023 04:58:29 +0100 Subject: sched: remove wait bookmarks There are no users of wait bookmarks left, so simplify the wait code by removing them. Link: https://lkml.kernel.org/r/20231010035829.544242-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ingo Molnar Cc: Benjamin Segall Cc: Bin Lai Cc: Daniel Bristot de Oliveira Cc: Dietmar Eggemann Cc: Ingo Molnar Cc: Juri Lelli Cc: Mel Gorman Cc: Peter Zijlstra Cc: Steven Rostedt (Google) Cc: Valentin Schneider Cc: Vincent Guittot Signed-off-by: Andrew Morton --- include/linux/wait.h | 9 +++----- kernel/sched/wait.c | 60 +++++++++------------------------------------------- 2 files changed, 13 insertions(+), 56 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5ec7739400f4..3473b663176f 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -19,10 +19,9 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int /* wait_queue_entry::flags */ #define WQ_FLAG_EXCLUSIVE 0x01 #define WQ_FLAG_WOKEN 0x02 -#define WQ_FLAG_BOOKMARK 0x04 -#define WQ_FLAG_CUSTOM 0x08 -#define WQ_FLAG_DONE 0x10 -#define WQ_FLAG_PRIORITY 0x20 +#define WQ_FLAG_CUSTOM 0x04 +#define WQ_FLAG_DONE 0x08 +#define WQ_FLAG_PRIORITY 0x10 /* * A single wait-queue entry structure: @@ -212,8 +211,6 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key); void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark); void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key); void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 802d98cf2de3..51e38f5f4701 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -57,13 +57,6 @@ void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry } EXPORT_SYMBOL(remove_wait_queue); -/* - * Scan threshold to break wait queue walk. - * This allows a waker to take a break from holding the - * wait queue lock during the wait queue walk. - */ -#define WAITQUEUE_WALK_BREAK_CNT 64 - /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve @@ -78,21 +71,13 @@ EXPORT_SYMBOL(remove_wait_queue); * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, - int nr_exclusive, int wake_flags, void *key, - wait_queue_entry_t *bookmark) + int nr_exclusive, int wake_flags, void *key) { wait_queue_entry_t *curr, *next; - int cnt = 0; lockdep_assert_held(&wq_head->lock); - if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) { - curr = list_next_entry(bookmark, entry); - - list_del(&bookmark->entry); - bookmark->flags = 0; - } else - curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); + curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry); if (&curr->entry == &wq_head->head) return nr_exclusive; @@ -101,21 +86,11 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, unsigned flags = curr->flags; int ret; - if (flags & WQ_FLAG_BOOKMARK) - continue; - ret = curr->func(curr, mode, wake_flags, key); if (ret < 0) break; if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; - - if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) && - (&next->entry != &wq_head->head)) { - bookmark->flags = WQ_FLAG_BOOKMARK; - list_add_tail(&bookmark->entry, &next->entry); - break; - } } return nr_exclusive; @@ -125,20 +100,12 @@ static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int m int nr_exclusive, int wake_flags, void *key) { unsigned long flags; - wait_queue_entry_t bookmark; - int remaining = nr_exclusive; + int remaining; - bookmark.flags = 0; - bookmark.private = NULL; - bookmark.func = NULL; - INIT_LIST_HEAD(&bookmark.entry); - - do { - spin_lock_irqsave(&wq_head->lock, flags); - remaining = __wake_up_common(wq_head, mode, remaining, - wake_flags, key, &bookmark); - spin_unlock_irqrestore(&wq_head->lock, flags); - } while (bookmark.flags & WQ_FLAG_BOOKMARK); + spin_lock_irqsave(&wq_head->lock, flags); + remaining = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, + key); + spin_unlock_irqrestore(&wq_head->lock, flags); return nr_exclusive - remaining; } @@ -171,23 +138,16 @@ void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode */ void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr) { - __wake_up_common(wq_head, mode, nr, 0, NULL, NULL); + __wake_up_common(wq_head, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, 0, key, NULL); + __wake_up_common(wq_head, mode, 1, 0, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_key); -void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head, - unsigned int mode, void *key, wait_queue_entry_t *bookmark) -{ - __wake_up_common(wq_head, mode, 1, 0, key, bookmark); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key_bookmark); - /** * __wake_up_sync_key - wake up threads blocked on a waitqueue. * @wq_head: the waitqueue @@ -233,7 +193,7 @@ EXPORT_SYMBOL_GPL(__wake_up_sync_key); void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key) { - __wake_up_common(wq_head, mode, 1, WF_SYNC, key, NULL); + __wake_up_common(wq_head, mode, 1, WF_SYNC, key); } EXPORT_SYMBOL_GPL(__wake_up_locked_sync_key); -- cgit v1.2.3 From 3657fdc2451abf135c2d20949acf57d78cc50338 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:27 +0100 Subject: mm: move vma_policy() and anon_vma_name() decls to mm_types.h Patch series "Abstract vma_merge() and split_vma()", v4. The vma_merge() interface is very confusing and its implementation has led to numerous bugs as a result of that confusion. In addition there is duplication both in invocation of vma_merge(), but also in the common mprotect()-style pattern of attempting a merge, then if this fails, splitting the portion of a VMA about to have its attributes changed. This pattern has been copy/pasted around the kernel in each instance where such an operation has been required, each very slightly modified from the last to make it even harder to decipher what is going on. Simplify the whole thing by dividing the actual uses of vma_merge() and split_vma() into specific and abstracted functions and de-duplicate the vma_merge()/split_vma() pattern altogether. Doing so also opens the door to changing how vma_merge() is implemented - by knowing precisely what cases a caller is invoking rather than having a central interface where anything might happen we can untangle the brittle and confusing vma_merge() implementation into something more workable. For mprotect()-like cases we introduce vma_modify() which performs the vma_merge()/split_vma() pattern, returning a pointer to either the merged or split VMA or an ERR_PTR(err) if the splits fail. We provide a number of inline helper functions to make things even clearer:- * vma_modify_flags() - Prepare to modify the VMA's flags. * vma_modify_flags_name() - Prepare to modify the VMA's flags/anon_vma_name * vma_modify_policy() - Prepare to modify the VMA's mempolicy. * vma_modify_flags_uffd() - Prepare to modify the VMA's flags/uffd context. For cases where a new VMA is attempted to be merged with adjacent VMAs we add:- * vma_merge_new_vma() - Prepare to merge a new VMA. * vma_merge_extend() - Prepare to extend the end of a new VMA. This patch (of 5): The vma_policy() define is a helper specifically for a VMA field so it makes sense to host it in the memory management types header. The anon_vma_name(), anon_vma_name_alloc() and anon_vma_name_free() functions are a little out of place in mm_inline.h as they define external functions, and so it makes sense to locate them in mm_types.h. The purpose of these relocations is to make it possible to abstract static inline wrappers which invoke both of these helpers. Link: https://lkml.kernel.org/r/cover.1697043508.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/24bfc6c9e382fffbcb0ea8d424392c27d56cc8ca.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 4 ---- include/linux/mm_inline.h | 20 +------------------- include/linux/mm_types.h | 27 +++++++++++++++++++++++++++ 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6c2754d7bfed..1d7f4ec2614c 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -89,8 +89,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) return pol; } -#define vma_policy(vma) ((vma)->vm_policy) - static inline void mpol_get(struct mempolicy *pol) { if (pol) @@ -222,8 +220,6 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) return NULL; } -#define vma_policy(vma) NULL - static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8148b30a9df1..9ae7def16cb2 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -352,15 +353,6 @@ void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) } #ifdef CONFIG_ANON_VMA_NAME -/* - * mmap_lock should be read-locked when calling anon_vma_name(). Caller should - * either keep holding the lock while using the returned pointer or it should - * raise anon_vma_name refcount before releasing the lock. - */ -extern struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); -extern struct anon_vma_name *anon_vma_name_alloc(const char *name); -extern void anon_vma_name_free(struct kref *kref); - /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { @@ -415,16 +407,6 @@ static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, } #else /* CONFIG_ANON_VMA_NAME */ -static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) -{ - return NULL; -} - -static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) -{ - return NULL; -} - static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36c5b43999e6..21eb56145f57 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -546,6 +546,27 @@ struct anon_vma_name { char name[]; }; +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling anon_vma_name(). Caller should + * either keep holding the lock while using the returned pointer or it should + * raise anon_vma_name refcount before releasing the lock. + */ +struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma); +struct anon_vma_name *anon_vma_name_alloc(const char *name); +void anon_vma_name_free(struct kref *kref); +#else /* CONFIG_ANON_VMA_NAME */ +static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + return NULL; +} +#endif + struct vma_lock { struct rw_semaphore lock; }; @@ -662,6 +683,12 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; +#ifdef CONFIG_NUMA +#define vma_policy(vma) ((vma)->vm_policy) +#else +#define vma_policy(vma) NULL +#endif + #ifdef CONFIG_SCHED_MM_CID struct mm_cid { u64 time; -- cgit v1.2.3 From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:28 +0100 Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al. mprotect() and other functions which change VMA parameters over a range each employ a pattern of:- 1. Attempt to merge the range with adjacent VMAs. 2. If this fails, and the range spans a subset of the VMA, split it accordingly. This is open-coded and duplicated in each case. Also in each case most of the parameters passed to vma_merge() remain the same. Create a new function, vma_modify(), which abstracts this operation, accepting only those parameters which can be changed. To avoid the mess of invoking each function call with unnecessary parameters, create inline wrapper functions for each of the modify operations, parameterised only by what is required to perform the action. We can also significantly simplify the logic - by returning the VMA if we split (or merged VMA if we do not) we no longer need specific handling for merge/split cases in any of the call sites. Note that the userfaultfd_release() case works even though it does not split VMAs - since start is set to vma->vm_start and end is set to vma->vm_end, the split logic does not trigger. In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start - vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this instance, this invocation will remain unchanged. We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that vma_merge() correctly ensures that flags remain the same, something that is already checked in is_mergeable_vma() and elsewhere, and in any case is not specific to mprotect(). Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 70 ++++++++++++++---------------------------------------- include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 26 ++++---------------- mm/mempolicy.c | 26 +++----------------- mm/mlock.c | 25 ++++--------------- mm/mmap.c | 48 +++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 29 ++++------------------ 7 files changed, 141 insertions(+), 143 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a7c6ef764e63..ac616cfbacf5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -927,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - } else { - prev = vma; - } + vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, + vma->vm_end, new_flags, + NULL_VM_UFFD_CTX); vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + prev = vma; } mmap_write_unlock(mm); mmput(mm); @@ -1331,7 +1326,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); - pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1484,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx }), - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and @@ -1568,7 +1548,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, const void __user *buf = (void __user *)arg; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); - pgoff_t pgoff; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1671,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, NULL_VM_UFFD_CTX); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and diff --git a/include/linux/mm.h b/include/linux/mm.h index fa608cba041f..895feb814620 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3251,6 +3251,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name); + +/* We are about to modify the VMA's flags. */ +static inline struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or anon_name. */ +static inline struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); +} + +/* We are about to modify the VMA's memory policy. */ +static inline struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or uffd context. */ +static inline struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), new_ctx, anon_vma_name(vma)); +} static inline int check_data_rlimit(unsigned long rlim, unsigned long new, diff --git a/mm/madvise.c b/mm/madvise.c index 59e8860c86af..a6b48d4796ba 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; int error; - pgoff_t pgoff; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { @@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_name); - if (*prev) { - vma = *prev; - goto success; - } + vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, + anon_name); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; - if (start != vma->vm_start) { - error = split_vma(&vmi, vma, start, 1); - if (error) - return error; - } - - if (end != vma->vm_end) { - error = split_vma(&vmi, vma, end, 0); - if (error) - return error; - } - -success: /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 38a47fa33ef4..8bb5b4b4b395 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -811,10 +811,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - struct vm_area_struct *merged; unsigned long vmstart, vmend; - pgoff_t pgoff; - int err; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { @@ -829,26 +826,9 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol, - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (merged) { - *prev = merged; - return vma_replace_policy(merged, new_pol); - } - - if (vma->vm_start != vmstart) { - err = split_vma(vmi, vma, vmstart, 1); - if (err) - return err; - } - - if (vma->vm_end != vmend) { - err = split_vma(vmi, vma, vmend, 0); - if (err) - return err; - } + vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); diff --git a/mm/mlock.c b/mm/mlock.c index 42b6865f8f82..aa44456200e3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -476,7 +476,6 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - pgoff_t pgoff; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; @@ -487,28 +486,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(vmi, mm, *prev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*prev) { - vma = *prev; - goto success; - } - - if (start != vma->vm_start) { - ret = split_vma(vmi, vma, start, 1); - if (ret) - goto out; - } - - if (end != vma->vm_end) { - ret = split_vma(vmi, vma, end, 0); - if (ret) - goto out; + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; } -success: /* * Keep track of amount of locked VM. */ diff --git a/mm/mmap.c b/mm/mmap.c index 673429ee8a9e..bca685820763 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2437,6 +2437,54 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. + */ +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name) +{ + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *merged; + + merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, + vma->anon_vma, vma->vm_file, pgoff, policy, + uffd_ctx, anon_name); + if (merged) + return merged; + + if (vma->vm_start < start) { + int err = split_vma(vmi, vma, start, 1); + + if (err) + return ERR_PTR(err); + } + + if (vma->vm_end > end) { + int err = split_vma(vmi, vma, end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mprotect.c b/mm/mprotect.c index 03e2cec3e669..f1dc8f8c84ef 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -581,7 +581,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; - pgoff_t pgoff; int error; if (newflags == oldflags) { @@ -631,34 +630,14 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, newflags &= ~VM_ACCOUNT; } - /* - * First try to merge with previous and/or next vma. - */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*pprev) { - vma = *pprev; - VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); - goto success; + vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + if (IS_ERR(vma)) { + error = PTR_ERR(vma); + goto fail; } *pprev = vma; - if (start != vma->vm_start) { - error = split_vma(vmi, vma, start, 1); - if (error) - goto fail; - } - - if (end != vma->vm_end) { - error = split_vma(vmi, vma, end, 0); - if (error) - goto fail; - } - -success: /* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. -- cgit v1.2.3 From adb20b0c785e9e8d5e4d4a07b9c3340d7de0e5dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:29 +0100 Subject: mm: make vma_merge() and split_vma() internal Now the common pattern of - attempting a merge via vma_merge() and should this fail splitting VMAs via split_vma() - has been abstracted, the former can be placed into mm/internal.h and the latter made static. In addition, the split_vma() nommu variant also need not be exported. Link: https://lkml.kernel.org/r/405f2be10e20c4e9fbcc9fe6b2dfea105f6642e0.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 --------- mm/internal.h | 9 +++++++++ mm/mmap.c | 8 ++++---- mm/nommu.c | 4 ++-- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 895feb814620..1bddb151cd5c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3235,16 +3235,7 @@ extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *next); extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); -extern struct vm_area_struct *vma_merge(struct vma_iterator *vmi, - struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, struct anon_vma *, - struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); -extern int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *, - unsigned long addr, int new_below); -extern int split_vma(struct vma_iterator *vmi, struct vm_area_struct *, - unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, diff --git a/mm/internal.h b/mm/internal.h index d1f7a3c612fe..2ab81c68be6c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1002,6 +1002,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); +/* + * mm/mmap.c + */ +struct vm_area_struct *vma_merge(struct vma_iterator *vmi, + struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, + unsigned long end, unsigned long vm_flags, struct anon_vma *, + struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, + struct anon_vma_name *); + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, diff --git a/mm/mmap.c b/mm/mmap.c index bca685820763..a516f2412f79 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2346,8 +2346,8 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas, * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the end VMA. */ -int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; @@ -2428,8 +2428,8 @@ out_free_vma: * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index f9553579389b..fc4afe924ad5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1305,8 +1305,8 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. */ -int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long addr, int new_below) +static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long addr, int new_below) { struct vm_area_struct *new; struct vm_region *region; -- cgit v1.2.3 From 4b5f2d20169827add8875e78a352cf956ccdd55a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:30 +0100 Subject: mm: abstract merge for new VMAs into vma_merge_new_vma() Only in mmap_region() and copy_vma() do we attempt to merge VMAs which occupy entirely new regions of virtual memory. We can abstract this logic and make the intent of this invocations of it completely explicit, rather than invoking vma_merge() with an inscrutable wall of parameters. This also paves the way for a simplification of the core vma_merge() implementation, as we seek to make it entirely an implementation detail. The VMA merge call in mmap_region() occurs only for file-backed mappings, where each of the parameters previously specified as NULL are defaulted to NULL in vma_init() (called by vm_area_alloc()). This matches the previous behaviour of specifying NULL for a number of fields, however note that prior to this call we pass the VMA to the file system driver via call_mmap(), which may in theory adjust fields that we pass in to vma_merge_new_vma(). Therefore we actually resolve an oversight here by allowing for the fact that the driver may have done this. Link: https://lkml.kernel.org/r/3dc71d17e307756a54781d4a4ce7315cf8b18bea.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index a516f2412f79..e5e50e547ebf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2485,6 +2485,20 @@ struct vm_area_struct *vma_modify(struct vma_iterator *vmi, return vma; } +/* + * Attempt to merge a newly mapped VMA with those adjacent to it. The caller + * must ensure that [start, end) does not overlap any existing VMA. + */ +static struct vm_area_struct +*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgoff_t pgoff) +{ + return vma_merge(vmi, vma->vm_mm, prev, start, end, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2840,10 +2854,9 @@ cannot_expand: * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(&vmi, mm, prev, vma->vm_start, - vma->vm_end, vma->vm_flags, NULL, - vma->vm_file, vma->vm_pgoff, NULL, - NULL_VM_UFFD_CTX, NULL); + merge = vma_merge_new_vma(&vmi, prev, vma, + vma->vm_start, vma->vm_end, + vma->vm_pgoff); if (merge) { /* * ->mmap() can change vma->vm_file and fput @@ -3385,9 +3398,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ - new_vma = vma_merge(&vmi, mm, prev, addr, addr + len, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff); if (new_vma) { /* * Source vma may have been merged into new_vma -- cgit v1.2.3 From 93bf5d4aa27d4ba528f71483ae51fbd70edb3ce8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:31 +0100 Subject: mm: abstract VMA merge and extend into vma_merge_extend() helper mremap uses vma_merge() in the case where a VMA needs to be extended. This can be significantly simplified and abstracted. This makes it far easier to understand what the actual function is doing, avoids future mistakes in use of the confusing vma_merge() function and importantly allows us to make future changes to how vma_merge() is implemented by knowing explicitly which merge cases each invocation uses. Note that in the mremap() extend case, we perform this merge only when old_len == vma->vm_end - addr. The extension_start, i.e. the start of the extended portion of the VMA is equal to addr + old_len, i.e. vma->vm_end. With this refactoring, vma_merge() is no longer required anywhere except mm/mmap.c, so mark it static. Link: https://lkml.kernel.org/r/f16cbdc2e72d37a1a097c39dc7d1fee8919a1c93.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/internal.h | 8 +++----- mm/mmap.c | 31 ++++++++++++++++++++++++------- mm/mremap.c | 30 +++++++++++++----------------- 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 2ab81c68be6c..c61a98d3b3c7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1005,11 +1005,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* * mm/mmap.c */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, - struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, struct anon_vma *, - struct file *, pgoff_t, struct mempolicy *, struct vm_userfaultfd_ctx, - struct anon_vma_name *); +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta); enum { /* mark page accessed */ diff --git a/mm/mmap.c b/mm/mmap.c index e5e50e547ebf..3ea52451623b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -860,13 +860,13 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, * **** is not represented - it will be merged and the vma containing the * area is returned, or the function will return NULL */ -struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, - struct vm_area_struct *prev, unsigned long addr, - unsigned long end, unsigned long vm_flags, - struct anon_vma *anon_vma, struct file *file, - pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx, - struct anon_vma_name *anon_name) +static struct vm_area_struct +*vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + struct vm_area_struct *prev, unsigned long addr, unsigned long end, + unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, + pgoff_t pgoff, struct mempolicy *policy, + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + struct anon_vma_name *anon_name) { struct vm_area_struct *curr, *next, *res; struct vm_area_struct *vma, *adjust, *remove, *remove2; @@ -2499,6 +2499,23 @@ static struct vm_area_struct vma->vm_userfaultfd_ctx, anon_vma_name(vma)); } +/* + * Expand vma by delta bytes, potentially merging with an immediately adjacent + * VMA with identical properties. + */ +struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, + struct vm_area_struct *vma, + unsigned long delta) +{ + pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma); + + /* vma is specified as prev, so case 1 or 2 will apply. */ + return vma_merge(vmi, vma->vm_mm, vma, vma->vm_end, vma->vm_end + delta, + vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mremap.c b/mm/mremap.c index ce8a23ef325a..38d98465f3d8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1096,14 +1096,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* old_len exactly to the end of the area.. */ if (old_len == vma->vm_end - addr) { + unsigned long delta = new_len - old_len; + /* can we just expand the current mapping? */ - if (vma_expandable(vma, new_len - old_len)) { - long pages = (new_len - old_len) >> PAGE_SHIFT; - unsigned long extension_start = addr + old_len; - unsigned long extension_end = addr + new_len; - pgoff_t extension_pgoff = vma->vm_pgoff + - ((extension_start - vma->vm_start) >> PAGE_SHIFT); - VMA_ITERATOR(vmi, mm, extension_start); + if (vma_expandable(vma, delta)) { + long pages = delta >> PAGE_SHIFT; + VMA_ITERATOR(vmi, mm, vma->vm_end); long charged = 0; if (vma->vm_flags & VM_ACCOUNT) { @@ -1115,17 +1113,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } /* - * Function vma_merge() is called on the extension we - * are adding to the already existing vma, vma_merge() - * will merge this extension with the already existing - * vma (expand operation itself) and possibly also with - * the next vma if it becomes adjacent to the expanded - * vma and otherwise compatible. + * Function vma_merge_extend() is called on the + * extension we are adding to the already existing vma, + * vma_merge_extend() will merge this extension with the + * already existing vma (expand operation itself) and + * possibly also with the next vma if it becomes + * adjacent to the expanded vma and otherwise + * compatible. */ - vma = vma_merge(&vmi, mm, vma, extension_start, - extension_end, vma->vm_flags, vma->anon_vma, - vma->vm_file, extension_pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + vma = vma_merge_extend(&vmi, vma, delta); if (!vma) { vm_unacct_memory(charged); ret = -ENOMEM; -- cgit v1.2.3 From b459f0905eec31196b6e841773aabe3194e3c3fe Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:45 -0400 Subject: mm/page_owner: remove free_ts from page_owner output Patch series "Fix page_owner's use of free timestamps". While page ower output is used to investigate memory utilization, typically the allocation pathway, the introduction of timestamps to the page owner records caused each record to become unique due to the granularity of the nanosecond timestamp (for example): Page allocated via order 0 ... ts 5206196026 ns, free_ts 5187156703 ns Page allocated via order 0 ... ts 5206198540 ns, free_ts 5187162702 ns Furthermore, the page_owner output only dumps the currently allocated records, so having the free timestamps is nonsensical for the typical use case. In addition, the introduction of timestamps was not properly handled in the page_owner_sort tool causing most use cases to be broken. This series is meant to remove the free timestamps from the page_owner output and fix the page_owner_sort tool so proper collation can occur. This patch (of 5): When printing page_owner data via the sysfs interface, no free pages will ever be dumped due to the series of checks in read_page_owner(): /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) The free_ts values are still used when dump_page_owner() is called, so keeping the field for other use cases but removing them for the typical page_owner case. Link: https://lkml.kernel.org/r/20231013190350.579407-1-audra@redhat.com Link: https://lkml.kernel.org/r/20231013190350.579407-2-audra@redhat.com Fixes: 866b48526217 ("mm/page_owner: record the timestamp of all pages during free") Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Reviewed-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- mm/page_owner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 4e2723e1b300..4f13ce7d2452 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -408,11 +408,11 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = scnprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, page_owner->tgid, page_owner->comm, - page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->ts_nsec); /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); -- cgit v1.2.3 From 0179c62839bdc49769a986e6eb1a6ca6fc7d274a Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:46 -0400 Subject: tools/mm: remove references to free_ts from page_owner_sort With the removal of free timestamps from page_owner output, we no longer need to handle this case or the "unreleased" case. Remove all references to both cases. Link: https://lkml.kernel.org/r/20231013190350.579407-3-audra@redhat.com Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Reviewed-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 98 ++++++---------------------------------------- 1 file changed, 12 insertions(+), 86 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 99798894b879..9c93f3f4514f 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -33,7 +33,6 @@ struct block_list { char *comm; // task command name char *stacktrace; __u64 ts_nsec; - __u64 free_ts_nsec; int len; int num; int page_num; @@ -42,18 +41,16 @@ struct block_list { int allocator; }; enum FILTER_BIT { - FILTER_UNRELEASE = 1<<1, - FILTER_PID = 1<<2, - FILTER_TGID = 1<<3, - FILTER_COMM = 1<<4 + FILTER_PID = 1<<1, + FILTER_TGID = 1<<2, + FILTER_COMM = 1<<3 }; enum CULL_BIT { - CULL_UNRELEASE = 1<<1, - CULL_PID = 1<<2, - CULL_TGID = 1<<3, - CULL_COMM = 1<<4, - CULL_STACKTRACE = 1<<5, - CULL_ALLOCATOR = 1<<6 + CULL_PID = 1<<1, + CULL_TGID = 1<<2, + CULL_COMM = 1<<3, + CULL_STACKTRACE = 1<<4, + CULL_ALLOCATOR = 1<<5 }; enum ALLOCATOR_BIT { ALLOCATOR_CMA = 1<<1, @@ -62,9 +59,8 @@ enum ALLOCATOR_BIT { ALLOCATOR_OTHERS = 1<<4 }; enum ARG_TYPE { - ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, - ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, - ARG_ALLOCATOR + ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_CULL_TIME, + ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_ALLOCATOR }; enum SORT_ORDER { SORT_ASC = 1, @@ -90,7 +86,6 @@ static regex_t pid_pattern; static regex_t tgid_pattern; static regex_t comm_pattern; static regex_t ts_nsec_pattern; -static regex_t free_ts_nsec_pattern; static struct block_list *list; static int list_size; static int max_size; @@ -181,24 +176,6 @@ static int compare_ts(const void *p1, const void *p2) return l1->ts_nsec < l2->ts_nsec ? -1 : 1; } -static int compare_free_ts(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; -} - -static int compare_release(const void *p1, const void *p2) -{ - const struct block_list *l1 = p1, *l2 = p2; - - if (!l1->free_ts_nsec && !l2->free_ts_nsec) - return 0; - if (l1->free_ts_nsec && l2->free_ts_nsec) - return 0; - return l1->free_ts_nsec ? 1 : -1; -} - static int compare_cull_condition(const void *p1, const void *p2) { if (cull == 0) @@ -211,8 +188,6 @@ static int compare_cull_condition(const void *p1, const void *p2) return compare_tgid(p1, p2); if ((cull & CULL_COMM) && compare_comm(p1, p2)) return compare_comm(p1, p2); - if ((cull & CULL_UNRELEASE) && compare_release(p1, p2)) - return compare_release(p1, p2); if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2)) return compare_allocator(p1, p2); return 0; @@ -366,24 +341,6 @@ static __u64 get_ts_nsec(char *buf) return ts_nsec; } -static __u64 get_free_ts_nsec(char *buf) -{ - __u64 free_ts_nsec; - char free_ts_nsec_str[FIELD_BUFF] = {0}; - char *endptr; - - search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf); - errno = 0; - free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10); - if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') { - if (debug_on) - fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf); - return -1; - } - - return free_ts_nsec; -} - static char *get_comm(char *buf) { char *comm_str = malloc(TASK_COMM_LEN); @@ -411,12 +368,8 @@ static int get_arg_type(const char *arg) return ARG_COMM; else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st")) return ARG_STACKTRACE; - else if (!strcmp(arg, "free") || !strcmp(arg, "f")) - return ARG_FREE; else if (!strcmp(arg, "txt") || !strcmp(arg, "T")) return ARG_TXT; - else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft")) - return ARG_FREE_TS; else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at")) return ARG_ALLOC_TS; else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator")) @@ -471,13 +424,6 @@ static bool match_str_list(const char *str, char **list, int list_size) static bool is_need(char *buf) { - __u64 ts_nsec, free_ts_nsec; - - ts_nsec = get_ts_nsec(buf); - free_ts_nsec = get_free_ts_nsec(buf); - - if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) - return false; if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) return false; if ((filter & FILTER_TGID) && @@ -528,7 +474,6 @@ static bool add_list(char *buf, int len, char *ext_buf) if (*list[list_size].stacktrace == '\n') list[list_size].stacktrace++; list[list_size].ts_nsec = get_ts_nsec(buf); - list[list_size].free_ts_nsec = get_free_ts_nsec(buf); list[list_size].allocator = get_allocator(buf, ext_buf); list_size++; if (list_size % 1000 == 0) { @@ -554,8 +499,6 @@ static bool parse_cull_args(const char *arg_str) cull |= CULL_COMM; else if (arg_type == ARG_STACKTRACE) cull |= CULL_STACKTRACE; - else if (arg_type == ARG_FREE) - cull |= CULL_UNRELEASE; else if (arg_type == ARG_ALLOCATOR) cull |= CULL_ALLOCATOR; else { @@ -616,8 +559,6 @@ static bool parse_sort_args(const char *arg_str) sc.cmps[i] = compare_stacktrace; else if (arg_type == ARG_ALLOC_TS) sc.cmps[i] = compare_ts; - else if (arg_type == ARG_FREE_TS) - sc.cmps[i] = compare_free_ts; else if (arg_type == ARG_TXT) sc.cmps[i] = compare_txt; else if (arg_type == ARG_ALLOCATOR) @@ -679,8 +620,6 @@ static void usage(void) "-P\t\tSort by tgid.\n" "-n\t\tSort by task command name.\n" "-a\t\tSort by memory allocate time.\n" - "-r\t\tSort by memory release time.\n" - "-f\t\tFilter out the information of blocks whose memory has been released.\n" "-d\t\tPrint debug information.\n" "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" @@ -706,7 +645,7 @@ int main(int argc, char **argv) { 0, 0, 0, 0}, }; - while ((opt = getopt_long(argc, argv, "adfmnprstP", longopts, NULL)) != -1) + while ((opt = getopt_long(argc, argv, "admnpstP", longopts, NULL)) != -1) switch (opt) { case 'a': set_single_cmp(compare_ts, SORT_ASC); @@ -714,18 +653,12 @@ int main(int argc, char **argv) case 'd': debug_on = true; break; - case 'f': - filter = filter | FILTER_UNRELEASE; - break; case 'm': set_single_cmp(compare_page_num, SORT_DESC); break; case 'p': set_single_cmp(compare_pid, SORT_ASC); break; - case 'r': - set_single_cmp(compare_free_ts, SORT_ASC); - break; case 's': set_single_cmp(compare_stacktrace, SORT_ASC); break; @@ -800,10 +733,8 @@ int main(int argc, char **argv) goto out_tgid; if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts")) goto out_comm; - if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,")) + if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns")) goto out_ts; - if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns")) - goto out_free_ts; fstat(fileno(fin), &st); max_size = st.st_size / 100; /* hack ... */ @@ -864,9 +795,6 @@ int main(int argc, char **argv) fprintf(fout, ", "); print_allocator(fout, list[i].allocator); } - if (cull & CULL_UNRELEASE) - fprintf(fout, " (%s)", - list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED"); if (cull & CULL_STACKTRACE) fprintf(fout, ":\n%s", list[i].stacktrace); fprintf(fout, "\n"); @@ -880,8 +808,6 @@ out_free: free(buf); if (list) free(list); -out_free_ts: - regfree(&free_ts_nsec_pattern); out_ts: regfree(&ts_nsec_pattern); out_comm: -- cgit v1.2.3 From 63a150623a2bf94c9ed503719a3423675a3aa0d3 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:47 -0400 Subject: tools/mm: filter out timestamps for correct collation With the introduction of allocation timestamps being included in page_owner output, each record becomes unique due to the timestamp nanosecond granularity. Remove the check in add_list that tries to collate each record during processing as the memcmp() is just additional overhead at this point. Also keep the allocation timestamps, but allow collation to occur without consideration of the allocation timestamp except in the case were allocation timestamps are requested by the user (the -a option). Link: https://lkml.kernel.org/r/20231013190350.579407-4-audra@redhat.com Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Acked-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 9c93f3f4514f..7ddabcb3a073 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -203,6 +203,21 @@ static int compare_sort_condition(const void *p1, const void *p2) return cmp; } +static int remove_pattern(regex_t *pattern, char *buf, int len) +{ + regmatch_t pmatch[2]; + int err; + + err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL); + if (err != 0 || pmatch[1].rm_so == -1) + return len; + + memcpy(buf + pmatch[1].rm_so, + buf + pmatch[1].rm_eo, len - pmatch[1].rm_eo); + + return len - (pmatch[1].rm_eo - pmatch[1].rm_so); +} + static int search_pattern(regex_t *pattern, char *pattern_str, char *buf) { int err, val_len; @@ -443,13 +458,6 @@ static bool is_need(char *buf) static bool add_list(char *buf, int len, char *ext_buf) { - if (list_size != 0 && - len == list[list_size-1].len && - memcmp(buf, list[list_size-1].txt, len) == 0) { - list[list_size-1].num++; - list[list_size-1].page_num += get_page_num(buf); - return true; - } if (list_size == max_size) { fprintf(stderr, "max_size too small??\n"); return false; @@ -465,6 +473,9 @@ static bool add_list(char *buf, int len, char *ext_buf) return false; } memcpy(list[list_size].txt, buf, len); + if (sc.cmps[0] != compare_ts) { + len = remove_pattern(&ts_nsec_pattern, list[list_size].txt, len); + } list[list_size].txt[len] = 0; list[list_size].len = len; list[list_size].num = 1; -- cgit v1.2.3 From c6d5e4901e00031650132aa30aa082a47c3796e5 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:48 -0400 Subject: tools/mm: fix the default case for page_owner_sort With the additional commands and timestamps added to the tool, the default case (-t) has been broken. Now that the allocation timestamps are saved outside of the txt field, allow us to properly sort the data by number of times the record has been seen. Furthermore prevent the misuse of the commandline arguments so only one compare option can be used. Link: https://lkml.kernel.org/r/20231013190350.579407-5-audra@redhat.com Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Acked-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 61 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 53 insertions(+), 8 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 7ddabcb3a073..5a260096ebaa 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -66,6 +66,16 @@ enum SORT_ORDER { SORT_ASC = 1, SORT_DESC = -1, }; +enum COMP_FLAG { + COMP_NO_FLAG = 0, + COMP_ALLOC = 1<<0, + COMP_PAGE_NUM = 1<<1, + COMP_PID = 1<<2, + COMP_STACK = 1<<3, + COMP_NUM = 1<<4, + COMP_TGID = 1<<5, + COMP_COMM = 1<<6 +}; struct filter_condition { pid_t *pids; pid_t *tgids; @@ -644,7 +654,7 @@ int main(int argc, char **argv) { FILE *fin, *fout; char *buf, *ext_buf; - int i, count; + int i, count, compare_flag; struct stat st; int opt; struct option longopts[] = { @@ -656,31 +666,33 @@ int main(int argc, char **argv) { 0, 0, 0, 0}, }; + compare_flag = COMP_NO_FLAG; + while ((opt = getopt_long(argc, argv, "admnpstP", longopts, NULL)) != -1) switch (opt) { case 'a': - set_single_cmp(compare_ts, SORT_ASC); + compare_flag |= COMP_ALLOC; break; case 'd': debug_on = true; break; case 'm': - set_single_cmp(compare_page_num, SORT_DESC); + compare_flag |= COMP_PAGE_NUM; break; case 'p': - set_single_cmp(compare_pid, SORT_ASC); + compare_flag |= COMP_PID; break; case 's': - set_single_cmp(compare_stacktrace, SORT_ASC); + compare_flag |= COMP_STACK; break; case 't': - set_single_cmp(compare_num, SORT_DESC); + compare_flag |= COMP_NUM; break; case 'P': - set_single_cmp(compare_tgid, SORT_ASC); + compare_flag |= COMP_TGID; break; case 'n': - set_single_cmp(compare_comm, SORT_ASC); + compare_flag |= COMP_COMM; break; case 1: filter = filter | FILTER_PID; @@ -728,6 +740,39 @@ int main(int argc, char **argv) exit(1); } + /* Only one compare option is allowed, yet we also want handle the + * default case were no option is provided, but we still want to + * match the behavior of the -t option (compare by number of times + * a record is seen + */ + switch (compare_flag) { + case COMP_ALLOC: + set_single_cmp(compare_ts, SORT_ASC); + break; + case COMP_PAGE_NUM: + set_single_cmp(compare_page_num, SORT_DESC); + break; + case COMP_PID: + set_single_cmp(compare_pid, SORT_ASC); + break; + case COMP_STACK: + set_single_cmp(compare_stacktrace, SORT_ASC); + break; + case COMP_NO_FLAG: + case COMP_NUM: + set_single_cmp(compare_num, SORT_DESC); + break; + case COMP_TGID: + set_single_cmp(compare_tgid, SORT_ASC); + break; + case COMP_COMM: + set_single_cmp(compare_comm, SORT_ASC); + break; + default: + usage(); + exit(1); + } + fin = fopen(argv[optind], "r"); fout = fopen(argv[optind + 1], "w"); if (!fin || !fout) { -- cgit v1.2.3 From d8ea435f071592d82479414e97e3c70bed204666 Mon Sep 17 00:00:00 2001 From: Audra Mitchell Date: Fri, 13 Oct 2023 15:03:49 -0400 Subject: tools/mm: update the usage output to be more organized Organize the usage options alphabetically and improve the description of some options. Also separate the more complicated cull options from the single use compare options. Link: https://lkml.kernel.org/r/20231013190350.579407-6-audra@redhat.com Signed-off-by: Audra Mitchell Acked-by: Rafael Aquini Acked-by: Vlastimil Babka Cc: Georgi Djakov Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index 5a260096ebaa..e1f264444342 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -634,19 +634,26 @@ static void print_allocator(FILE *out, int allocator) static void usage(void) { printf("Usage: ./page_owner_sort [OPTIONS] \n" - "-m\t\tSort by total memory.\n" - "-s\t\tSort by the stack trace.\n" - "-t\t\tSort by times (default).\n" - "-p\t\tSort by pid.\n" - "-P\t\tSort by tgid.\n" - "-n\t\tSort by task command name.\n" - "-a\t\tSort by memory allocate time.\n" - "-d\t\tPrint debug information.\n" - "--pid \tSelect by pid. This selects the information of blocks whose process ID numbers appear in .\n" - "--tgid \tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in .\n" - "--name \n\t\tSelect by command name. This selects the information of blocks whose command name appears in .\n" - "--cull \tCull by user-defined rules. is a single argument in the form of a comma-separated list with some common fields predefined\n" - "--sort \tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n" + "-a\t\t\tSort by memory allocation time.\n" + "-m\t\t\tSort by total memory.\n" + "-n\t\t\tSort by task command name.\n" + "-p\t\t\tSort by pid.\n" + "-P\t\t\tSort by tgid.\n" + "-s\t\t\tSort by the stacktrace.\n" + "-t\t\t\tSort by number of times record is seen (default).\n\n" + "--pid \t\tSelect by pid. This selects the information" + " of\n\t\t\tblocks whose process ID numbers appear in .\n" + "--tgid \tSelect by tgid. This selects the information" + " of\n\t\t\tblocks whose Thread Group ID numbers appear in " + ".\n" + "--name \tSelect by command name. This selects the" + " information\n\t\t\tof blocks whose command name appears in" + " .\n" + "--cull \t\tCull by user-defined rules. is a " + "single\n\t\t\targument in the form of a comma-separated list " + "with some\n\t\t\tcommon fields predefined (pid, tgid, comm, " + "stacktrace, allocator)\n" + "--sort \t\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n" ); } -- cgit v1.2.3 From 4d4e41b682990b1dc5bba2bc313800340bf5c2d4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:53 +0000 Subject: mm/damon/sysfs-schemes: do not update tried regions more than one DAMON snapshot Patch series "mm/damon/sysfs-schemes: Do DAMOS tried regions update for only one apply interval". DAMOS tried regions update feature of DAMON sysfs interface is doing the update for one aggregation interval after the request is made. Since the per-scheme apply interval is supported, that behavior makes no much sense. That is, the tried regions directory will have regions from multiple DAMON monitoring results snapshots, or no region for apply intervals that much shorter than, or longer than the aggregation interval, respectively. Update the behavior to update the regions for each scheme for only its apply interval, and update the document. Since DAMOS apply interval is the aggregation by default, this change makes no visible behavioral difference to old users who don't explicitly set the apply intervals. Patches Sequence ---------------- The first two patches makes schemes of apply intervals that much shorter or longer than the aggregation interval to keep the maximum and minimum times for continuing the update. After the two patches, the update aligns with the each scheme's apply interval. Finally, the third patch updates the document to reflect the behavior. This patch (of 3): DAMON_SYSFS exposes every DAMON-found region that eligible for applying the scheme action for one aggregation interval. However, each DAMON-based operation scheme has its own apply interval. Hence, for a scheme that having its apply interval much smaller than the aggregation interval, DAMON_SYSFS will expose the scheme regions that applied to more than one DAMON monitoring results snapshots. Since the purpose of DAMON tried regions is exposing single snapshot, this makes no much sense. Track progress of each scheme's tried regions update and avoid the case. Link: https://lkml.kernel.org/r/20231012192256.33556-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231012192256.33556-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index a7d70b95c4dd..b07a5c544b34 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -113,11 +113,47 @@ static const struct kobj_type damon_sysfs_scheme_region_ktype = { * scheme regions directory */ +/* + * enum damos_sysfs_regions_upd_status - Represent DAMOS tried regions update + * status + * @DAMOS_TRIED_REGIONS_UPD_IDLE: Waiting for next request. + * @DAMOS_TRIED_REGIONS_UPD_STARTED: Update started. + * @DAMOS_TRIED_REGIONS_UPD_FINISHED: Update finished. + * + * Each DAMON-based operation scheme (&struct damos) has its own apply + * interval, and we need to expose the scheme tried regions based on only + * single snapshot. For this, we keep the tried regions update status for each + * scheme. The status becomes 'idle' at the beginning. + * + * Once the tried regions update request is received, the request handling + * start function (damon_sysfs_scheme_update_regions_start()) sets the status + * of all schemes as 'idle' again, and register ->before_damos_apply() and + * ->after_sampling() callbacks. + * + * Then, the first followup ->before_damos_apply() callback + * (damon_sysfs_before_damos_apply()) sets the status 'started'. The first + * ->after_sampling() callback (damon_sysfs_after_sampling()) after the call + * is called only after the scheme is completely applied + * to the given snapshot. Hence the callback knows the situation by showing + * 'started' status, and sets the status as 'finished'. Then, + * damon_sysfs_before_damos_apply() understands the situation by showing the + * 'finished' status and do nothing. + * + * Finally, the tried regions request handling finisher function + * (damon_sysfs_schemes_update_regions_stop()) unregisters the callbacks. + */ +enum damos_sysfs_regions_upd_status { + DAMOS_TRIED_REGIONS_UPD_IDLE, + DAMOS_TRIED_REGIONS_UPD_STARTED, + DAMOS_TRIED_REGIONS_UPD_FINISHED, +}; + struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; unsigned long total_bytes; + enum damos_sysfs_regions_upd_status upd_status; }; static struct damon_sysfs_scheme_regions * @@ -130,6 +166,7 @@ damon_sysfs_scheme_regions_alloc(void) INIT_LIST_HEAD(®ions->regions_list); regions->nr_regions = 0; regions->total_bytes = 0; + regions->upd_status = DAMOS_TRIED_REGIONS_UPD_IDLE; return regions; } @@ -1777,6 +1814,10 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_FINISHED) + return 0; + if (sysfs_regions->upd_status == DAMOS_TRIED_REGIONS_UPD_IDLE) + sysfs_regions->upd_status = DAMOS_TRIED_REGIONS_UPD_STARTED; sysfs_regions->total_bytes += r->ar.end - r->ar.start; if (damos_regions_upd_total_bytes_only) return 0; @@ -1793,6 +1834,29 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; } +/* + * DAMON callback that called after each accesses sampling. While this + * callback is registered, damon_sysfs_lock should be held to ensure the + * regions directories exist. + */ +static int damon_sysfs_after_sampling(struct damon_ctx *ctx) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status == + DAMOS_TRIED_REGIONS_UPD_STARTED) + sysfs_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_FINISHED; + } + + return 0; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_clear_regions( struct damon_sysfs_schemes *sysfs_schemes, @@ -1816,6 +1880,16 @@ int damon_sysfs_schemes_clear_regions( return 0; } +static void damos_tried_regions_init_upd_status( + struct damon_sysfs_schemes *sysfs_schemes) +{ + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) + sysfs_schemes->schemes_arr[i]->tried_regions->upd_status = + DAMOS_TRIED_REGIONS_UPD_IDLE; +} + /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, @@ -1823,8 +1897,10 @@ int damon_sysfs_schemes_update_regions_start( { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_tried_regions_init_upd_status(sysfs_schemes); damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + ctx->callback.after_sampling = damon_sysfs_after_sampling; return 0; } @@ -1837,6 +1913,7 @@ int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) { damon_sysfs_schemes_for_damos_callback = NULL; ctx->callback.before_damos_apply = NULL; + ctx->callback.after_sampling = NULL; damon_sysfs_schemes_region_idx = 0; return 0; } -- cgit v1.2.3 From 76126332c7606ba25a4ae5db37145fd526985b45 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:54 +0000 Subject: mm/damon/sysfs: avoid empty scheme tried regions for large apply interval DAMON_SYSFS assumes all schemes will be applied for at least one DAMON monitoring results snapshot within one aggregation interval, or makes no sense to wait for it while DAMON is deactivated by the watermarks. That for deactivated status still makes sense, but the aggregation interval based assumption is invalid now because each scheme can has its own apply interval. For schemes having larger than the aggregation or watermarks check interval, DAMOS tried regions update request can be finished without the update. Avoid the case by explicitly checking the status of the schemes tried regions update and watermarks based DAMON deactivation. Link: https://lkml.kernel.org/r/20231012192256.33556-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 ++ mm/damon/sysfs-schemes.c | 16 ++++++++++++++++ mm/damon/sysfs.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index fd482a0639b4..5ff081226e28 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -49,6 +49,8 @@ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, struct damon_ctx *ctx, bool total_bytes_only); +bool damos_sysfs_regions_upd_done(void); + int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); int damon_sysfs_schemes_clear_regions( diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index b07a5c544b34..45bd0fd4a8b1 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1904,6 +1904,22 @@ int damon_sysfs_schemes_update_regions_start( return 0; } +bool damos_sysfs_regions_upd_done(void) +{ + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + struct damon_sysfs_scheme_regions *sysfs_regions; + int i; + + for (i = 0; i < sysfs_schemes->nr; i++) { + sysfs_regions = sysfs_schemes->schemes_arr[i]->tried_regions; + if (sysfs_regions->upd_status != + DAMOS_TRIED_REGIONS_UPD_FINISHED) + return false; + } + return true; +} + /* * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller * should unlock damon_sysfs_lock which held before diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f60e56150feb..f73dc88d2d19 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1336,12 +1336,13 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. - * @c: The DAMON context of the callback. + * @c: The DAMON context of the callback. + * @active: Whether @c is not deactivated due to watermarks. * * This function is periodically called back from the kdamond thread for @c. * Then, it checks if there is a waiting DAMON sysfs request and handles it. */ -static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) +static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active) { struct damon_sysfs_kdamond *kdamond; bool total_bytes_only = false; @@ -1373,6 +1374,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) goto keep_lock_out; } } else { + /* + * Continue regions updating if DAMON is till + * active and the update for all schemes is not + * finished. + */ + if (active && !damos_sysfs_regions_upd_done()) + goto keep_lock_out; err = damon_sysfs_upd_schemes_regions_stop(kdamond); damon_sysfs_schemes_regions_updating = false; } @@ -1392,6 +1400,24 @@ keep_lock_out: return err; } +static int damon_sysfs_after_wmarks_check(struct damon_ctx *c) +{ + /* + * after_wmarks_check() is called back while the context is deactivated + * by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, false); +} + +static int damon_sysfs_after_aggregation(struct damon_ctx *c) +{ + /* + * after_aggregation() is called back only while the context is not + * deactivated by watermarks. + */ + return damon_sysfs_cmd_request_callback(c, true); +} + static struct damon_ctx *damon_sysfs_build_ctx( struct damon_sysfs_context *sys_ctx) { @@ -1407,8 +1433,8 @@ static struct damon_ctx *damon_sysfs_build_ctx( return ERR_PTR(err); } - ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback; - ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback; + ctx->callback.after_wmarks_check = damon_sysfs_after_wmarks_check; + ctx->callback.after_aggregation = damon_sysfs_after_aggregation; ctx->callback.before_terminate = damon_sysfs_before_terminate; return ctx; } -- cgit v1.2.3 From bc17ea26a8db89f6604d1386e08cdfc78a70f4f1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 12 Oct 2023 19:22:55 +0000 Subject: Docs/admin-guide/mm/damon/usage: update for tried regions update time interval The documentation says DAMOS tried regions update feature of DAMON sysfs interface is doing the update for one aggregation interval after the request is made. Since the introduction of the per-scheme apply interval, that behavior makes no much sense. Hence the implementation has changed to update the regions for each scheme for only its apply interval. Further update the document to reflect the real behavior. Link: https://lkml.kernel.org/r/20231012192256.33556-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 8507a6e45d86..da94feb97ed1 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -432,9 +432,9 @@ that reading it returns the total size of the scheme tried regions, and creates directories named integer starting from ``0`` under this directory. Each directory contains files exposing detailed information about each of the memory region that the corresponding scheme's ``action`` has tried to be applied under -this directory, during next :ref:`aggregation interval -`. The information includes address range, -``nr_accesses``, and ``age`` of the region. +this directory, during next :ref:`apply interval ` of the +corresponding scheme. The information includes address range, ``nr_accesses``, +and ``age`` of the region. Writing ``update_schemes_tried_bytes`` to the relevant ``kdamonds//state`` file will only update the ``total_bytes`` file, and will not create the -- cgit v1.2.3 From e8e17ee90eaf650c855adb0a3e5e965fd6692ff1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:28 +0100 Subject: mm: drop the assumption that VM_SHARED always implies writable Patch series "permit write-sealed memfd read-only shared mappings", v4. The man page for fcntl() describing memfd file seals states the following about F_SEAL_WRITE:- Furthermore, trying to create new shared, writable memory-mappings via mmap(2) will also fail with EPERM. With emphasis on 'writable'. In turns out in fact that currently the kernel simply disallows all new shared memory mappings for a memfd with F_SEAL_WRITE applied, rendering this documentation inaccurate. This matters because users are therefore unable to obtain a shared mapping to a memfd after write sealing altogether, which limits their usefulness. This was reported in the discussion thread [1] originating from a bug report [2]. This is a product of both using the struct address_space->i_mmap_writable atomic counter to determine whether writing may be permitted, and the kernel adjusting this counter when any VM_SHARED mapping is performed and more generally implicitly assuming VM_SHARED implies writable. It seems sensible that we should only update this mapping if VM_MAYWRITE is specified, i.e. whether it is possible that this mapping could at any point be written to. If we do so then all we need to do to permit write seals to function as documented is to clear VM_MAYWRITE when mapping read-only. It turns out this functionality already exists for F_SEAL_FUTURE_WRITE - we can therefore simply adapt this logic to do the same for F_SEAL_WRITE. We then hit a chicken and egg situation in mmap_region() where the check for VM_MAYWRITE occurs before we are able to clear this flag. To work around this, perform this check after we invoke call_mmap(), with careful consideration of error paths. Thanks to Andy Lutomirski for the suggestion! [1]:https://lore.kernel.org/all/20230324133646.16101dfa666f253c4715d965@linux-foundation.org/ [2]:https://bugzilla.kernel.org/show_bug.cgi?id=217238 This patch (of 3): There is a general assumption that VMAs with the VM_SHARED flag set are writable. If the VM_MAYWRITE flag is not set, then this is simply not the case. Update those checks which affect the struct address_space->i_mmap_writable field to explicitly test for this by introducing [vma_]is_shared_maywrite() helper functions. This remains entirely conservative, as the lack of VM_MAYWRITE guarantees that the VMA cannot be written to. Link: https://lkml.kernel.org/r/cover.1697116581.git.lstoakes@gmail.com Link: https://lkml.kernel.org/r/d978aefefa83ec42d18dfa964ad180dbcde34795.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Suggested-by: Andy Lutomirski Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/fs.h | 4 ++-- include/linux/mm.h | 11 +++++++++++ kernel/fork.c | 2 +- mm/filemap.c | 2 +- mm/madvise.c | 2 +- mm/mmap.c | 12 ++++++------ 6 files changed, 22 insertions(+), 11 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index fd539c9fef8e..5265186da0e2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -454,7 +454,7 @@ extern const struct address_space_operations empty_aops; * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. - * @i_mmap_writable: Number of VM_SHARED mappings. + * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. @@ -557,7 +557,7 @@ static inline int mapping_mapped(struct address_space *mapping) /* * Might pages of this file have been modified in userspace? - * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap + * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * diff --git a/include/linux/mm.h b/include/linux/mm.h index 1bddb151cd5c..c3b5749ede9d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -937,6 +937,17 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & VM_ACCESS_FLAGS; } +static inline bool is_shared_maywrite(vm_flags_t vm_flags) +{ + return (vm_flags & (VM_SHARED | VM_MAYWRITE)) == + (VM_SHARED | VM_MAYWRITE); +} + +static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma) +{ + return is_shared_maywrite(vma->vm_flags); +} + static inline struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max) { diff --git a/kernel/fork.c b/kernel/fork.c index e45a4457ba83..1e6c656e0857 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -733,7 +733,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, get_file(file); i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ diff --git a/mm/filemap.c b/mm/filemap.c index 9ef49255f1a5..9710f43a89ac 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3618,7 +3618,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) */ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) { - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + if (vma_is_shared_maywrite(vma)) return -EINVAL; return generic_file_mmap(file, vma); } diff --git a/mm/madvise.c b/mm/madvise.c index a6b48d4796ba..cf4d694280e9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -970,7 +970,7 @@ static long madvise_remove(struct vm_area_struct *vma, return -EINVAL; } - if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) + if (!vma_is_shared_maywrite(vma)) return -EACCES; offset = (loff_t)(start - vma->vm_start) diff --git a/mm/mmap.c b/mm/mmap.c index 3ea52451623b..0041e3631f6c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -107,7 +107,7 @@ void vma_set_page_prot(struct vm_area_struct *vma) static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct file *file, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -384,7 +384,7 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); @@ -2846,7 +2846,7 @@ cannot_expand: vma->vm_pgoff = pgoff; if (file) { - if (vm_flags & VM_SHARED) { + if (is_shared_maywrite(vm_flags)) { error = mapping_map_writable(file->f_mapping); if (error) goto free_vma; @@ -2920,7 +2920,7 @@ cannot_expand: mm->map_count++; if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); - if (vma->vm_flags & VM_SHARED) + if (vma_is_shared_maywrite(vma)) mapping_allow_writable(vma->vm_file->f_mapping); flush_dcache_mmap_lock(vma->vm_file->f_mapping); @@ -2937,7 +2937,7 @@ cannot_expand: /* Once vma denies write, undo our temporary denial count */ unmap_writable: - if (file && vm_flags & VM_SHARED) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); @@ -2985,7 +2985,7 @@ unmap_and_free_vma: unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } - if (file && (vm_flags & VM_SHARED)) + if (file && is_shared_maywrite(vm_flags)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); -- cgit v1.2.3 From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:29 +0100 Subject: mm: update memfd seal write check to include F_SEAL_WRITE The seal_check_future_write() function is called by shmem_mmap() or hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd sealed this way. The F_SEAL_WRITE flag is not checked here, as that is handled via the mapping->i_mmap_writable mechanism and so any attempt at a mapping would fail before this could be run. However we intend to change this, meaning this check can be performed for F_SEAL_WRITE mappings also. The logic here is equally applicable to both flags, so update this function to accommodate both and rename it accordingly. Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Andy Lutomirski Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/mm.h | 15 ++++++++------- mm/shmem.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 926d01c493fb..85cd418d2340 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -135,7 +135,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index c3b5749ede9d..86e040e886e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4076,25 +4076,26 @@ static inline void mem_dump_obj(void *object) {} #endif /** - * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and + * handle them. * @seals: the seals to check * @vma: the vma to operate on * - * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on - * the vma flags. Return 0 if check pass, or <0 for errors. + * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper + * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. */ -static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & F_SEAL_FUTURE_WRITE) { + if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. + * write seals are active. */ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) return -EPERM; /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as * MAP_SHARED and read-only, take care to not allow mprotect to * revert protections on such mappings. Do this only for shared * mappings. For private mappings, don't need to mask diff --git a/mm/shmem.c b/mm/shmem.c index 61b170324e5c..bcbe9dbb9f5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2378,7 +2378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) struct shmem_inode_info *info = SHMEM_I(inode); int ret; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; -- cgit v1.2.3 From 158978945f3173b8c1a88f8c5684a629736a57ac Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:30 +0100 Subject: mm: perform the mapping_map_writable() check after call_mmap() In order for a F_SEAL_WRITE sealed memfd mapping to have an opportunity to clear VM_MAYWRITE, we must be able to invoke the appropriate vm_ops->mmap() handler to do so. We would otherwise fail the mapping_map_writable() check before we had the opportunity to avoid it. This patch moves this check after the call_mmap() invocation. Only memfd actively denies write access causing a potential failure here (in memfd_add_seals()), so there should be no impact on non-memfd cases. This patch makes the userland-visible change that MAP_SHARED, PROT_READ mappings of an F_SEAL_WRITE sealed memfd mapping will now succeed. There is a delicate situation with cleanup paths assuming that a writable mapping must have occurred in circumstances where it may now not have. In order to ensure we do not accidentally mark a writable file unwritable by mistake, we explicitly track whether we have a writable mapping and unmap only if we do. [lstoakes@gmail.com: do not set writable_file_mapping in inappropriate case] Link: https://lkml.kernel.org/r/c9eb4cc6-7db4-4c2b-838d-43a0b319a4f0@lucifer.local Link: https://bugzilla.kernel.org/show_bug.cgi?id=217238 Link: https://lkml.kernel.org/r/55e413d20678a1bb4c7cce889062bbb07b0df892.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Andy Lutomirski Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/mmap.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 0041e3631f6c..8b57e42fd980 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2752,6 +2752,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long charged = 0; unsigned long end = addr + len; unsigned long merge_start = addr, merge_end = end; + bool writable_file_mapping = false; pgoff_t vm_pgoff; int error; VMA_ITERATOR(vmi, mm, addr); @@ -2846,17 +2847,19 @@ cannot_expand: vma->vm_pgoff = pgoff; if (file) { - if (is_shared_maywrite(vm_flags)) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - vma->vm_file = get_file(file); error = call_mmap(file, vma); if (error) goto unmap_and_free_vma; + if (vma_is_shared_maywrite(vma)) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto close_and_free_vma; + + writable_file_mapping = true; + } + /* * Expansion is handled above, merging is handled below. * Drivers should not alter the address of the VMA. @@ -2937,7 +2940,7 @@ cannot_expand: /* Once vma denies write, undo our temporary denial count */ unmap_writable: - if (file && is_shared_maywrite(vm_flags)) + if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; ksm_add_vma(vma); @@ -2985,7 +2988,7 @@ unmap_and_free_vma: unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, vma->vm_end, vma->vm_end, true); } - if (file && is_shared_maywrite(vm_flags)) + if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); -- cgit v1.2.3 From fa8c4f9a665bd0cf5a475327aea4fd179e896216 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 11 Aug 2023 17:08:19 +0800 Subject: mm: fix draining remote pageset If there is no memory allocation/freeing in the PCP (Per-CPU Pageset) of a remote zone (zone in remote NUMA node) after some time (3 seconds for now), the pages of the PCP of the remote zone will be drained to avoid memory wastage. This behavior was introduced in the commit 4ae7c03943fc ("[PATCH] Periodically drain non local pagesets") and the commit 4037d452202e ("Move remote node draining out of slab allocators") But, after the commit 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8"), the vmstat updater worker which is used to drain the PCP of remote zones may not be re-queued when we are waiting for the timeout (pcp->expire != 0) if there are no vmstat changes on this CPU, for example, when the CPU goes idle or runs user space only workloads. This may cause the pages of a remote zone be kept in PCP of this CPU for long time. So that, the page reclaiming of the remote zone may be triggered prematurely. This isn't a severe problem in practice, because the PCP of the remote zone will be drained if some memory are allocated/freed again on this CPU. And, the PCP will eventually be drained during the direct reclaiming if necessary. Anyway, the problem still deserves a fix via guaranteeing that the vmstat updater worker will always be re-queued when we are waiting for the timeout. In effect, this restores the original behavior before the commit 7cc36bbddde5. We can reproduce the bug via allocating/freeing pages from a remote zone then go idle as follows. And the patch can fix it. - Run some workloads, use `numactl` to bind CPU to node 0 and memory to node 1. So the PCP of the CPU on node 0 for zone on node 1 will be filled. - After workloads finish, idle for 60s - Check /proc/zoneinfo With the original kernel, the number of pages in the PCP of the CPU on node 0 for zone on node 1 is non-zero after idle. With the patched kernel, it becomes 0 after idle. That is, we avoid to keep pages in the remote PCP during idle. Link: https://lkml.kernel.org/r/20231007062356.187621-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20230811090819.60845-1-ying.huang@intel.com Fixes: 7cc36bbddde5 ("vmstat: on-demand vmstat workers V8") Signed-off-by: "Huang, Ying" Reviewed-by: Christoph Lameter Cc: Mel Gorman Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmstat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 894e4c88d241..88ea95d4221c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -857,8 +857,10 @@ static int refresh_cpu_vm_stats(bool do_pagesets) continue; } - if (__this_cpu_dec_return(pcp->expire)) + if (__this_cpu_dec_return(pcp->expire)) { + changes++; continue; + } if (__this_cpu_read(pcp->count)) { drain_zone_pages(zone, this_cpu_ptr(pcp)); -- cgit v1.2.3 From d2cf88c27f51361dfe2982e510a444411d2fc78a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:03 -0700 Subject: hugetlb: optimize update_and_free_pages_bulk to avoid lock cycles MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Batch hugetlb vmemmap modification operations", v8. When hugetlb vmemmap optimization was introduced, the overhead of enabling the option was measured as described in commit 426e5c429d16 [1]. The summary states that allocating a hugetlb page should be ~2x slower with optimization and freeing a hugetlb page should be ~2-3x slower. Such overhead was deemed an acceptable trade off for the memory savings obtained by freeing vmemmap pages. It was recently reported that the overhead associated with enabling vmemmap optimization could be as high as 190x for hugetlb page allocations. Yes, 190x! Some actual numbers from other environments are: Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895 ------------------------------------------------ Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.119s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.477s Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m28.973s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m36.748s VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan ----------------------------------------------------------- Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 0 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.463s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m2.931s Unmodified next-20230824, vm.hugetlb_optimize_vmemmap = 1 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 2m27.609s time echo 0 > .../hugepages-2048kB/nr_hugepages real 2m29.924s In the VM environment, the slowdown of enabling hugetlb vmemmap optimization resulted in allocation times being 61x slower. A quick profile showed that the vast majority of this overhead was due to TLB flushing. Each time we modify the kernel pagetable we need to flush the TLB. For each hugetlb that is optimized, there could be potentially two TLB flushes performed. One for the vmemmap pages associated with the hugetlb page, and potentially another one if the vmemmap pages are mapped at the PMD level and must be split. The TLB flushes required for the kernel pagetable, result in a broadcast IPI with each CPU having to flush a range of pages, or do a global flush if a threshold is exceeded. So, the flush time increases with the number of CPUs. In addition, in virtual environments the broadcast IPI can’t be accelerated by hypervisor hardware and leads to traps that need to wakeup/IPI all vCPUs which is very expensive. Because of this the slowdown in virtual environments is even worse than bare metal as the number of vCPUS/CPUs is increased. The following series attempts to reduce amount of time spent in TLB flushing. The idea is to batch the vmemmap modification operations for multiple hugetlb pages. Instead of doing one or two TLB flushes for each page, we do two TLB flushes for each batch of pages. One flush after splitting pages mapped at the PMD level, and another after remapping vmemmap associated with all hugetlb pages. Results of such batching are as follows: Bare Metal 8 socket Intel(R) Xeon(R) CPU E7-8895 ------------------------------------------------ next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m4.719s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m4.245s next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1 time echo 500000 > .../hugepages-2048kB/nr_hugepages real 0m7.267s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m13.199s VM with 252 vcpus on host with 2 socket AMD EPYC 7J13 Milan ----------------------------------------------------------- next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 0 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m2.715s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m3.186s next-20230824 + Batching patches, vm.hugetlb_optimize_vmemmap = 1 time echo 524288 > .../hugepages-2048kB/nr_hugepages real 0m4.799s time echo 0 > .../hugepages-2048kB/nr_hugepages real 0m5.273s With batching, results are back in the 2-3x slowdown range. This patch (of 8): update_and_free_pages_bulk is designed to free a list of hugetlb pages back to their associated lower level allocators. This may require allocating vmemmmap pages associated with each hugetlb page. The hugetlb page destructor must be changed before pages are freed to lower level allocators. However, the destructor must be changed under the hugetlb lock. This means there is potentially one lock cycle per page. Minimize the number of lock cycles in update_and_free_pages_bulk by: 1) allocating necessary vmemmap for all hugetlb pages on the list 2) take hugetlb lock and clear destructor for all pages on the list 3) free all pages on list back to low level allocators Link: https://lkml.kernel.org/r/20231019023113.345257-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20231019023113.345257-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Acked-by: James Houghton Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da6f85b7db88..b839080a2a6b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1862,7 +1862,46 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct folio *folio, *t_folio; + bool clear_dtor = false; + /* + * First allocate required vmemmmap (if necessary) for all folios on + * list. If vmemmap can not be allocated, we can not free folio to + * lower level allocator, so add back as hugetlb surplus page. + * add_hugetlb_folio() removes the page from THIS list. + * Use clear_dtor to note if vmemmap was successfully allocated for + * ANY page on the list. + */ + list_for_each_entry_safe(folio, t_folio, list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (hugetlb_vmemmap_restore(h, &folio->page)) { + spin_lock_irq(&hugetlb_lock); + add_hugetlb_folio(h, folio, true); + spin_unlock_irq(&hugetlb_lock); + } else + clear_dtor = true; + } + } + + /* + * If vmemmmap allocation was performed on any folio above, take lock + * to clear destructor of all folios on list. This avoids the need to + * lock/unlock for each individual folio. + * The assumption is vmemmap allocation was performed on all or none + * of the folios on the list. This is true expect in VERY rare cases. + */ + if (clear_dtor) { + spin_lock_irq(&hugetlb_lock); + list_for_each_entry(folio, list, lru) + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + } + + /* + * Free folios back to low level allocators. vmemmap and destructors + * were taken care of above, so update_and_free_hugetlb_folio will + * not need to take hugetlb lock. + */ list_for_each_entry_safe(folio, t_folio, list, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); -- cgit v1.2.3 From d67e32f26713c35669e343edfdce6fa97a7d6bbc Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:04 -0700 Subject: hugetlb: restructure pool allocations Allocation of a hugetlb page for the hugetlb pool is done by the routine alloc_pool_huge_page. This routine will allocate contiguous pages from a low level allocator, prep the pages for usage as a hugetlb page and then add the resulting hugetlb page to the pool. In the 'prep' stage, optional vmemmap optimization is done. For performance reasons we want to perform vmemmap optimization on multiple hugetlb pages at once. To do this, restructure the hugetlb pool allocation code such that vmemmap optimization can be isolated and later batched. The code to allocate hugetlb pages from bootmem was also modified to allow batching. No functional changes, only code restructure. Link: https://lkml.kernel.org/r/20231019023113.345257-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Tested-by: Sergey Senozhatsky Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 141 insertions(+), 39 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b839080a2a6b..559f7c71c596 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1996,16 +1996,21 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) { folio_set_hugetlb(folio); - hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); } +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) +{ + init_new_hugetlb_folio(h, folio); + hugetlb_vmemmap_optimize(h, &folio->page); +} + static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { __prep_new_hugetlb_folio(h, folio); @@ -2202,16 +2207,9 @@ retry: return page_folio(page); } -/* - * Common helper to allocate a fresh hugetlb page. All specific allocators - * should use this function to get new hugetlb pages - * - * Note that returned page is 'frozen': ref count of head page and all tail - * pages is zero. - */ -static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask, - nodemask_t *node_alloc_noretry) +static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; bool retry = false; @@ -2224,6 +2222,7 @@ retry: nid, nmask, node_alloc_noretry); if (!folio) return NULL; + if (hstate_is_gigantic(h)) { if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* @@ -2238,32 +2237,81 @@ retry: return NULL; } } - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); return folio; } +static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) +{ + struct folio *folio; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (folio) + init_new_hugetlb_folio(h, folio); + return folio; +} + /* - * Allocates a fresh page to the hugetlb allocator pool in the node interleaved - * manner. + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ -static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, - nodemask_t *node_alloc_noretry) +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask, + nodemask_t *node_alloc_noretry) { struct folio *folio; - int nr_nodes, node; + + folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, + node_alloc_noretry); + if (!folio) + return NULL; + + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + return folio; +} + +static void prep_and_add_allocated_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + +/* + * Allocates a fresh hugetlb page in a node interleaved manner. The page + * will later be added to the appropriate hugetlb pool. + */ +static struct folio *alloc_pool_huge_folio(struct hstate *h, + nodemask_t *nodes_allowed, + nodemask_t *node_alloc_noretry) +{ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + int nr_nodes, node; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + struct folio *folio; + + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); - if (folio) { - free_huge_folio(folio); /* free it into the hugepage allocator */ - return 1; - } + if (folio) + return folio; } - return 0; + return NULL; } /* @@ -3302,25 +3350,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, */ static void __init gather_bootmem_prealloc(void) { + LIST_HEAD(folio_list); struct huge_bootmem_page *m; + struct hstate *h = NULL, *prev_h = NULL; list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); struct folio *folio = (void *)page; - struct hstate *h = m->hstate; + + h = m->hstate; + /* + * It is possible to have multiple huge page sizes (hstates) + * in this list. If so, process each size separately. + */ + if (h != prev_h && prev_h != NULL) + prep_and_add_allocated_folios(prev_h, &folio_list); + prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); WARN_ON(folio_ref_count(folio) != 1); hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - prep_new_hugetlb_folio(h, folio, folio_nid(folio)); + __prep_new_hugetlb_folio(h, folio); /* If HVO fails, initialize all tail struct pages */ if (!HPageVmemmapOptimized(&folio->page)) hugetlb_folio_init_tail_vmemmap(folio, HUGETLB_VMEMMAP_RESERVE_PAGES, pages_per_huge_page(h)); - free_huge_folio(folio); /* add to the hugepage allocator */ + list_add(&folio->lru, &folio_list); /* * We need to restore the 'stolen' pages to totalram_pages @@ -3330,6 +3388,8 @@ static void __init gather_bootmem_prealloc(void) adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } + + prep_and_add_allocated_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) @@ -3363,9 +3423,22 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) h->max_huge_pages_node[nid] = i; } +/* + * NOTE: this routine is called in different contexts for gigantic and + * non-gigantic pages. + * - For gigantic pages, this is called early in the boot process and + * pages are allocated from memblock allocated or something similar. + * Gigantic pages are actually added to pools later with the routine + * gather_bootmem_prealloc. + * - For non-gigantic pages, this is called later in the boot process after + * all of mm is up and functional. Pages are allocated from buddy and + * then added to hugetlb pools. + */ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; + struct folio *folio; + LIST_HEAD(folio_list); nodemask_t *node_alloc_noretry; bool node_specific_alloc = false; @@ -3407,14 +3480,25 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { + /* + * gigantic pages not added to list as they are not + * added to pools now. + */ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; - } else if (!alloc_pool_huge_page(h, - &node_states[N_MEMORY], - node_alloc_noretry)) - break; + } else { + folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY], + node_alloc_noretry); + if (!folio) + break; + list_add(&folio->lru, &folio_list); + } cond_resched(); } + + /* list will be empty if hstate_is_gigantic */ + prep_and_add_allocated_folios(h, &folio_list); + if (i < h->max_huge_pages) { char buf[32]; @@ -3548,7 +3632,9 @@ found: static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { - unsigned long min_count, ret; + unsigned long min_count; + unsigned long allocated; + struct folio *folio; LIST_HEAD(page_list); NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL); @@ -3625,7 +3711,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, break; } - while (count > persistent_huge_pages(h)) { + allocated = 0; + while (count > (persistent_huge_pages(h) + allocated)) { /* * If this allocation races such that we no longer need the * page, free_huge_folio will handle it by freeing the page @@ -3636,15 +3723,32 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_pool_huge_page(h, nodes_allowed, + folio = alloc_pool_huge_folio(h, nodes_allowed, node_alloc_noretry); - spin_lock_irq(&hugetlb_lock); - if (!ret) + if (!folio) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + list_add(&folio->lru, &page_list); + allocated++; /* Bail for signals. Probably ctrl-c from user */ - if (signal_pending(current)) + if (signal_pending(current)) { + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); goto out; + } + + spin_lock_irq(&hugetlb_lock); + } + + /* Add allocated pages to the pool */ + if (!list_empty(&page_list)) { + spin_unlock_irq(&hugetlb_lock); + prep_and_add_allocated_folios(h, &page_list); + spin_lock_irq(&hugetlb_lock); } /* @@ -3670,8 +3774,6 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, * Collect pages to be removed on list without dropping lock */ while (min_count < persistent_huge_pages(h)) { - struct folio *folio; - folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0); if (!folio) break; -- cgit v1.2.3 From 79359d6d24df2f304abbf6f137b01d2b76175ce3 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:05 -0700 Subject: hugetlb: perform vmemmap optimization on a list of pages When adding hugetlb pages to the pool, we first create a list of the allocated pages before adding to the pool. Pass this list of pages to a new routine hugetlb_vmemmap_optimize_folios() for vmemmap optimization. Due to significant differences in vmemmmap initialization for bootmem allocated hugetlb pages, a new routine prep_and_add_bootmem_folios is created. We also modify the routine vmemmap_should_optimize() to check for pages that are already optimized. There are code paths that might request vmemmap optimization twice and we want to make sure this is not attempted. Link: https://lkml.kernel.org/r/20231019023113.345257-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++++++++++-------- mm/hugetlb_vmemmap.c | 11 +++++++++++ mm/hugetlb_vmemmap.h | 5 +++++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 559f7c71c596..8b171f866d0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2282,6 +2282,9 @@ static void prep_and_add_allocated_folios(struct hstate *h, unsigned long flags; struct folio *folio, *tmp_f; + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + /* Add all new pool pages to free lists in one lock cycle */ spin_lock_irqsave(&hugetlb_lock, flags); list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { @@ -3344,6 +3347,35 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio, prep_compound_head((struct page *)folio, huge_page_order(h)); } +static void __init prep_and_add_bootmem_folios(struct hstate *h, + struct list_head *folio_list) +{ + unsigned long flags; + struct folio *folio, *tmp_f; + + /* Send list for bulk vmemmap optimization processing */ + hugetlb_vmemmap_optimize_folios(h, folio_list); + + /* Add all new pool pages to free lists in one lock cycle */ + spin_lock_irqsave(&hugetlb_lock, flags); + list_for_each_entry_safe(folio, tmp_f, folio_list, lru) { + if (!folio_test_hugetlb_vmemmap_optimized(folio)) { + /* + * If HVO fails, initialize all tail struct pages + * We do not worry about potential long lock hold + * time as this is early in boot and there should + * be no contention. + */ + hugetlb_folio_init_tail_vmemmap(folio, + HUGETLB_VMEMMAP_RESERVE_PAGES, + pages_per_huge_page(h)); + } + __prep_account_new_huge_page(h, folio_nid(folio)); + enqueue_hugetlb_folio(h, folio); + } + spin_unlock_irqrestore(&hugetlb_lock, flags); +} + /* * Put bootmem huge pages into the standard lists after mem_map is up. * Note: This only applies to gigantic (order > MAX_ORDER) pages. @@ -3364,7 +3396,7 @@ static void __init gather_bootmem_prealloc(void) * in this list. If so, process each size separately. */ if (h != prev_h && prev_h != NULL) - prep_and_add_allocated_folios(prev_h, &folio_list); + prep_and_add_bootmem_folios(prev_h, &folio_list); prev_h = h; VM_BUG_ON(!hstate_is_gigantic(h)); @@ -3372,12 +3404,7 @@ static void __init gather_bootmem_prealloc(void) hugetlb_folio_init_vmemmap(folio, h, HUGETLB_VMEMMAP_RESERVE_PAGES); - __prep_new_hugetlb_folio(h, folio); - /* If HVO fails, initialize all tail struct pages */ - if (!HPageVmemmapOptimized(&folio->page)) - hugetlb_folio_init_tail_vmemmap(folio, - HUGETLB_VMEMMAP_RESERVE_PAGES, - pages_per_huge_page(h)); + init_new_hugetlb_folio(h, folio); list_add(&folio->lru, &folio_list); /* @@ -3389,7 +3416,7 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } - prep_and_add_allocated_folios(h, &folio_list); + prep_and_add_bootmem_folios(h, &folio_list); } static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 76682d1d79a7..4558b814ffab 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -483,6 +483,9 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { + if (HPageVmemmapOptimized((struct page *)head)) + return false; + if (!READ_ONCE(vmemmap_optimize_enabled)) return false; @@ -572,6 +575,14 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) SetHPageVmemmapOptimized(head); } +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ + struct folio *folio; + + list_for_each_entry(folio, folio_list, lru) + hugetlb_vmemmap_optimize(h, &folio->page); +} + static struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 4573899855d7..c512e388dbb4 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -20,6 +20,7 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) { @@ -48,6 +49,10 @@ static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page { } +static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) +{ +} + static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h) { return 0; -- cgit v1.2.3 From cfb8c75099dbf152db1b075eead5029c5a30ce51 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:06 -0700 Subject: hugetlb: perform vmemmap restoration on a list of pages The routine update_and_free_pages_bulk already performs vmemmap restoration on the list of hugetlb pages in a separate step. In preparation for more functionality to be added in this step, create a new routine hugetlb_vmemmap_restore_folios() that will restore vmemmap for a list of folios. This new routine must provide sufficient feedback about errors and actual restoration performed so that update_and_free_pages_bulk can perform optimally. Special care must be taken when encountering an error from hugetlb_vmemmap_restore_folios. We want to continue making as much forward progress as possible. A new routine bulk_vmemmap_restore_error handles this specific situation. Link: https://lkml.kernel.org/r/20231019023113.345257-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb.c | 99 +++++++++++++++++++++++++++++++++++++--------------- mm/hugetlb_vmemmap.c | 38 ++++++++++++++++++++ mm/hugetlb_vmemmap.h | 11 ++++++ 3 files changed, 120 insertions(+), 28 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b171f866d0a..cf834bb7f820 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,50 +1859,93 @@ static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, schedule_work(&free_hpage_work); } -static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) +static void bulk_vmemmap_restore_error(struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; - bool clear_dtor = false; - /* - * First allocate required vmemmmap (if necessary) for all folios on - * list. If vmemmap can not be allocated, we can not free folio to - * lower level allocator, so add back as hugetlb surplus page. - * add_hugetlb_folio() removes the page from THIS list. - * Use clear_dtor to note if vmemmap was successfully allocated for - * ANY page on the list. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { - if (folio_test_hugetlb_vmemmap_optimized(folio)) { + if (!list_empty(non_hvo_folios)) { + /* + * Free any restored hugetlb pages so that restore of the + * entire list can be retried. + * The idea is that in the common case of ENOMEM errors freeing + * hugetlb pages with vmemmap we will free up memory so that we + * can allocate vmemmap for more hugetlb pages. + */ + list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + } + } else { + /* + * In the case where there are no folios which can be + * immediately freed, we loop through the list trying to restore + * vmemmap individually in the hope that someone elsewhere may + * have done something to cause success (such as freeing some + * memory). If unable to restore a hugetlb page, the hugetlb + * page is made a surplus page and removed from the list. + * If are able to restore vmemmap and free one hugetlb page, we + * quit processing the list to retry the bulk operation. + */ + list_for_each_entry_safe(folio, t_folio, folio_list, lru) if (hugetlb_vmemmap_restore(h, &folio->page)) { + list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); - } else - clear_dtor = true; - } + } else { + list_del(&folio->lru); + spin_lock_irq(&hugetlb_lock); + __clear_hugetlb_destructor(h, folio); + spin_unlock_irq(&hugetlb_lock); + update_and_free_hugetlb_folio(h, folio, false); + cond_resched(); + break; + } } +} + +static void update_and_free_pages_bulk(struct hstate *h, + struct list_head *folio_list) +{ + long ret; + struct folio *folio, *t_folio; + LIST_HEAD(non_hvo_folios); /* - * If vmemmmap allocation was performed on any folio above, take lock - * to clear destructor of all folios on list. This avoids the need to - * lock/unlock for each individual folio. - * The assumption is vmemmap allocation was performed on all or none - * of the folios on the list. This is true expect in VERY rare cases. + * First allocate required vmemmmap (if necessary) for all folios. + * Carefully handle errors and free up any available hugetlb pages + * in an effort to make forward progress. */ - if (clear_dtor) { +retry: + ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios); + if (ret < 0) { + bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios); + goto retry; + } + + /* + * At this point, list should be empty, ret should be >= 0 and there + * should only be pages on the non_hvo_folios list. + * Do note that the non_hvo_folios list could be empty. + * Without HVO enabled, ret will be 0 and there is no need to call + * __clear_hugetlb_destructor as this was done previously. + */ + VM_WARN_ON(!list_empty(folio_list)); + VM_WARN_ON(ret < 0); + if (!list_empty(&non_hvo_folios) && ret) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(folio, list, lru) + list_for_each_entry(folio, &non_hvo_folios, lru) __clear_hugetlb_destructor(h, folio); spin_unlock_irq(&hugetlb_lock); } - /* - * Free folios back to low level allocators. vmemmap and destructors - * were taken care of above, so update_and_free_hugetlb_folio will - * not need to take hugetlb lock. - */ - list_for_each_entry_safe(folio, t_folio, list, lru) { + list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) { update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4558b814ffab..77f44b81ff01 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -480,6 +480,44 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) return ret; } +/** + * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. + * @h: hstate. + * @folio_list: list of folios. + * @non_hvo_folios: Output list of folios for which vmemmap exists. + * + * Return: number of folios for which vmemmap was restored, or an error code + * if an error was encountered restoring vmemmap for a folio. + * Folios that have vmemmap are moved to the non_hvo_folios + * list. Processing of entries stops when the first error is + * encountered. The folio that experienced the error and all + * non-processed folios will remain on folio_list. + */ +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + struct folio *folio, *t_folio; + long restored = 0; + long ret = 0; + + list_for_each_entry_safe(folio, t_folio, folio_list, lru) { + if (folio_test_hugetlb_vmemmap_optimized(folio)) { + ret = hugetlb_vmemmap_restore(h, &folio->page); + if (ret) + break; + restored++; + } + + /* Add non-optimized folios to output list */ + list_move(&folio->lru, non_hvo_folios); + } + + if (!ret) + ret = restored; + return ret; +} + /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *head) { diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index c512e388dbb4..a0dcf49f46ba 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -19,6 +19,9 @@ #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios); void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); @@ -45,6 +48,14 @@ static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *h return 0; } +static long hugetlb_vmemmap_restore_folios(const struct hstate *h, + struct list_head *folio_list, + struct list_head *non_hvo_folios) +{ + list_splice_init(folio_list, non_hvo_folios); + return 0; +} + static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { } -- cgit v1.2.3 From 91f386bf0772c1976622bd0b119b78094603c3d0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:07 -0700 Subject: hugetlb: batch freeing of vmemmap pages Now that batching of hugetlb vmemmap optimization processing is possible, batch the freeing of vmemmap pages. When freeing vmemmap pages for a hugetlb page, we add them to a list that is freed after the entire batch has been processed. This enhances the ability to return contiguous ranges of memory to the low level allocators. Link: https://lkml.kernel.org/r/20231019023113.345257-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Joao Martins Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 82 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 77f44b81ff01..4ac521e596db 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -251,7 +251,7 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, } entry = mk_pte(walk->reuse_page, pgprot); - list_add_tail(&page->lru, walk->vmemmap_pages); + list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } @@ -306,18 +306,20 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. + * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers + * responsibility to free pages. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, - unsigned long reuse) + unsigned long reuse, + struct list_head *vmemmap_pages) { int ret; - LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, + .vmemmap_pages = vmemmap_pages, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -334,7 +336,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); - list_add(&walk.reuse_page->lru, &vmemmap_pages); + list_add(&walk.reuse_page->lru, vmemmap_pages); } /* @@ -365,15 +367,13 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, - .vmemmap_pages = &vmemmap_pages, + .vmemmap_pages = vmemmap_pages, }; vmemmap_remap_range(reuse, end, &walk); } mmap_read_unlock(&init_mm); - free_vmemmap_page_list(&vmemmap_pages); - return ret; } @@ -389,7 +389,7 @@ static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; - list_add_tail(&page->lru, list); + list_add(&page->lru, list); } return 0; @@ -577,24 +577,17 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h return true; } -/** - * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. - * @h: struct hstate. - * @head: the head page whose vmemmap pages will be optimized. - * - * This function only tries to optimize @head's vmemmap pages and does not - * guarantee that the optimization will succeed after it returns. The caller - * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages - * have been optimized. - */ -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +static int __hugetlb_vmemmap_optimize(const struct hstate *h, + struct page *head, + struct list_head *vmemmap_pages) { + int ret = 0; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE(!PageHuge(head)); if (!vmemmap_should_optimize(h, head)) - return; + return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); @@ -604,21 +597,58 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) - * to the page which @vmemmap_reuse is mapped to, then free the pages - * which the range [@vmemmap_start, @vmemmap_end] is mapped to. + * to the page which @vmemmap_reuse is mapped to. Add pages previously + * mapping the range to vmemmap_pages list so that they can be freed by + * the caller. */ - if (vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse)) + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages); + if (ret) static_branch_dec(&hugetlb_optimize_vmemmap_key); else SetHPageVmemmapOptimized(head); + + return ret; +} + +/** + * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be optimized. + * + * This function only tries to optimize @head's vmemmap pages and does not + * guarantee that the optimization will succeed after it returns. The caller + * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages + * have been optimized. + */ +void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +{ + LIST_HEAD(vmemmap_pages); + + __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages); + free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; + LIST_HEAD(vmemmap_pages); + + list_for_each_entry(folio, folio_list, lru) { + int ret = __hugetlb_vmemmap_optimize(h, &folio->page, + &vmemmap_pages); + + /* + * Pages to be freed may have been accumulated. If we + * encounter an ENOMEM, free what we have and try again. + */ + if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { + free_vmemmap_page_list(&vmemmap_pages); + INIT_LIST_HEAD(&vmemmap_pages); + __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); + } + } - list_for_each_entry(folio, folio_list, lru) - hugetlb_vmemmap_optimize(h, &folio->page); + free_vmemmap_page_list(&vmemmap_pages); } static struct ctl_table hugetlb_vmemmap_sysctls[] = { -- cgit v1.2.3 From f4b7e3efaddbf8fa7cffacb5956b0823b7a7730a Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 18 Oct 2023 19:31:08 -0700 Subject: hugetlb: batch PMD split for bulk vmemmap dedup In an effort to minimize amount of TLB flushes, batch all PMD splits belonging to a range of pages in order to perform only 1 (global) TLB flush. Add a flags field to the walker and pass whether it's a bulk allocation or just a single page to decide to remap. First value (VMEMMAP_SPLIT_NO_TLB_FLUSH) designates the request to not do the TLB flush when we split the PMD. Rebased and updated by Mike Kravetz Link: https://lkml.kernel.org/r/20231019023113.345257-7-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 92 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 4ac521e596db..10739e4285d5 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -27,6 +27,8 @@ * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. + * @flags: used to modify behavior in vmemmap page table walking + * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, @@ -35,9 +37,13 @@ struct vmemmap_remap_walk { struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; + +/* Skip the TLB flush when we split the PMD */ +#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) + unsigned long flags; }; -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, bool flush) { pmd_t __pmd; int i; @@ -80,7 +86,8 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); + if (flush) + flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -127,11 +134,20 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr, do { int ret; - ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, + !(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)); if (ret) return ret; next = pmd_addr_end(addr, end); + + /* + * We are only splitting, not remapping the hugetlb vmemmap + * pages. + */ + if (!walk->remap_pte) + continue; + vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -198,7 +214,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); + if (walk->remap_pte) + flush_tlb_kernel_range(start, end); return 0; } @@ -297,6 +314,36 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } +/** + * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) + * backing PMDs of the directmap into PTEs + * @start: start address of the vmemmap virtual address range that we want + * to remap. + * @end: end address of the vmemmap virtual address range that we want to + * remap. + * @reuse: reuse address. + * + * Return: %0 on success, negative error code otherwise. + */ +static int vmemmap_remap_split(unsigned long start, unsigned long end, + unsigned long reuse) +{ + int ret; + struct vmemmap_remap_walk walk = { + .remap_pte = NULL, + .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, + }; + + /* See the comment in the vmemmap_remap_free(). */ + BUG_ON(start - reuse != PAGE_SIZE); + + mmap_read_lock(&init_mm); + ret = vmemmap_remap_range(reuse, end, &walk); + mmap_read_unlock(&init_mm); + + return ret; +} + /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap @@ -320,6 +367,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -368,6 +416,7 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, + .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); @@ -419,6 +468,7 @@ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, + .flags = 0, }; /* See the comment in the vmemmap_remap_free(). */ @@ -628,11 +678,45 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) free_vmemmap_page_list(&vmemmap_pages); } +static int hugetlb_vmemmap_split(const struct hstate *h, struct page *head) +{ + unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; + unsigned long vmemmap_reuse; + + if (!vmemmap_should_optimize(h, head)) + return 0; + + vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); + vmemmap_reuse = vmemmap_start; + vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; + + /* + * Split PMDs on the vmemmap virtual address range [@vmemmap_start, + * @vmemmap_end] + */ + return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); +} + void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { struct folio *folio; LIST_HEAD(vmemmap_pages); + list_for_each_entry(folio, folio_list, lru) { + int ret = hugetlb_vmemmap_split(h, &folio->page); + + /* + * Spliting the PMD requires allocating a page, thus lets fail + * early once we encounter the first OOM. No point in retrying + * as it can be dynamically done on remap with the memory + * we get back from the vmemmap deduplication. + */ + if (ret == -ENOMEM) + break; + } + + flush_tlb_all(); + list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); -- cgit v1.2.3 From f13b83fdd996409e3dbf6f34c7629a9d13fdb9c1 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 18 Oct 2023 19:31:09 -0700 Subject: hugetlb: batch TLB flushes when freeing vmemmap Now that a list of pages is deduplicated at once, the TLB flush can be batched for all vmemmap pages that got remapped. Expand the flags field value to pass whether to skip the TLB flush on remap of the PTE. The TLB flush is global as we don't have guarantees from caller that the set of folios is contiguous, or to add complexity in composing a list of kVAs to flush. Modified by Mike Kravetz to perform TLB flush on single folio if an error is encountered. Link: https://lkml.kernel.org/r/20231019023113.345257-8-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 49 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 10739e4285d5..9df350372046 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -40,6 +40,8 @@ struct vmemmap_remap_walk { /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) +/* Skip the TLB flush when we remap the PTE */ +#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) unsigned long flags; }; @@ -214,7 +216,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - if (walk->remap_pte) + if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; @@ -355,19 +357,21 @@ static int vmemmap_remap_split(unsigned long start, unsigned long end, * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. + * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, - struct list_head *vmemmap_pages) + struct list_head *vmemmap_pages, + unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, - .flags = 0, + .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; @@ -629,7 +633,8 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h static int __hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head, - struct list_head *vmemmap_pages) + struct list_head *vmemmap_pages, + unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; @@ -640,6 +645,18 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); + /* + * Very Subtle + * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed + * immediately after remapping. As a result, subsequent accesses + * and modifications to struct pages associated with the hugetlb + * page could be to the OLD struct pages. Set the vmemmap optimized + * flag here so that it is copied to the new head page. This keeps + * the old and new struct pages in sync. + * If there is an error during optimization, we will immediately FLUSH + * the TLB and clear the flag below. + */ + SetHPageVmemmapOptimized(head); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; @@ -651,11 +668,12 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ - ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages); - if (ret) + ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, + vmemmap_pages, flags); + if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); - else - SetHPageVmemmapOptimized(head); + ClearHPageVmemmapOptimized(head); + } return ret; } @@ -674,7 +692,7 @@ void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages); + __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -719,19 +737,28 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l list_for_each_entry(folio, folio_list, lru) { int ret = __hugetlb_vmemmap_optimize(h, &folio->page, - &vmemmap_pages); + &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. + * This can occur in the case that both spliting fails + * halfway and head page allocation also failed. In this + * case __hugetlb_vmemmap_optimize() would free memory + * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { + flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize(h, &folio->page, &vmemmap_pages); + __hugetlb_vmemmap_optimize(h, &folio->page, + &vmemmap_pages, + VMEMMAP_REMAP_NO_TLB_FLUSH); } } + flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } -- cgit v1.2.3 From c24f188b22892908a2a3bb2de0ce7d121dd72989 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 18 Oct 2023 19:31:10 -0700 Subject: hugetlb: batch TLB flushes when restoring vmemmap Update the internal hugetlb restore vmemmap code path such that TLB flushing can be batched. Use the existing mechanism of passing the VMEMMAP_REMAP_NO_TLB_FLUSH flag to indicate flushing should not be performed for individual pages. The routine hugetlb_vmemmap_restore_folios is the only user of this new mechanism, and it will perform a global flush after all vmemmap is restored. Link: https://lkml.kernel.org/r/20231019023113.345257-9-mike.kravetz@oracle.com Signed-off-by: Joao Martins Signed-off-by: Mike Kravetz Reviewed-by: Muchun Song Cc: Anshuman Khandual Cc: Barry Song <21cnbao@gmail.com> Cc: David Hildenbrand Cc: David Rientjes Cc: James Houghton Cc: Konrad Dybcio Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Sergey Senozhatsky Cc: Usama Arif Cc: Xiongchun Duan Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9df350372046..d2999c303031 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -461,18 +461,19 @@ out: * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. + * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, - unsigned long reuse) + unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, - .flags = 0, + .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ @@ -494,17 +495,7 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -/** - * hugetlb_vmemmap_restore - restore previously optimized (by - * hugetlb_vmemmap_optimize()) vmemmap pages which - * will be reallocated and remapped. - * @h: struct hstate. - * @head: the head page whose vmemmap pages will be restored. - * - * Return: %0 if @head's vmemmap pages have been reallocated and remapped, - * negative error code otherwise. - */ -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; @@ -525,7 +516,7 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ - ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse); + ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { ClearHPageVmemmapOptimized(head); static_branch_dec(&hugetlb_optimize_vmemmap_key); @@ -534,6 +525,21 @@ int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) return ret; } +/** + * hugetlb_vmemmap_restore - restore previously optimized (by + * hugetlb_vmemmap_optimize()) vmemmap pages which + * will be reallocated and remapped. + * @h: struct hstate. + * @head: the head page whose vmemmap pages will be restored. + * + * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * negative error code otherwise. + */ +int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +{ + return __hugetlb_vmemmap_restore(h, head, 0); +} + /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. @@ -557,7 +563,8 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = hugetlb_vmemmap_restore(h, &folio->page); + ret = __hugetlb_vmemmap_restore(h, &folio->page, + VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; restored++; @@ -567,6 +574,8 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_move(&folio->lru, non_hvo_folios); } + if (restored) + flush_tlb_all(); if (!ret) ret = restored; return ret; -- cgit v1.2.3 From c5ad3233ead566d74918bd76ba2dbdbe83ba1d9d Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Wed, 11 Oct 2023 15:45:57 +0100 Subject: hugetlb_vmemmap: use folio argument for hugetlb_vmemmap_* functions Most function calls in hugetlb.c are made with folio arguments. This brings hugetlb_vmemmap calls inline with them by using folio instead of head struct page. Head struct page is still needed within these functions. The set/clear/test functions for hugepages are also changed to folio versions. Link: https://lkml.kernel.org/r/20231011144557.1720481-2-usama.arif@bytedance.com Signed-off-by: Usama Arif Reviewed-by: Muchun Song Cc: Fam Zheng Cc: Mike Kravetz Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++------- mm/hugetlb_vmemmap.c | 50 ++++++++++++++++++++++++++------------------------ mm/hugetlb_vmemmap.h | 8 ++++---- 3 files changed, 37 insertions(+), 35 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cf834bb7f820..dd8065e36038 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1747,10 +1747,10 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, /* * If folio is not vmemmap optimized (!clear_dtor), then the folio - * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore + * is no longer identified as a hugetlb page. hugetlb_vmemmap_restore_folio * can only be passed hugetlb pages and will BUG otherwise. */ - if (clear_dtor && hugetlb_vmemmap_restore(h, &folio->page)) { + if (clear_dtor && hugetlb_vmemmap_restore_folio(h, folio)) { spin_lock_irq(&hugetlb_lock); /* * If we cannot allocate vmemmap pages, just refuse to free the @@ -1893,7 +1893,7 @@ static void bulk_vmemmap_restore_error(struct hstate *h, * quit processing the list to retry the bulk operation. */ list_for_each_entry_safe(folio, t_folio, folio_list, lru) - if (hugetlb_vmemmap_restore(h, &folio->page)) { + if (hugetlb_vmemmap_restore_folio(h, folio)) { list_del(&folio->lru); spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, true); @@ -2051,7 +2051,7 @@ static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio) static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { init_new_hugetlb_folio(h, folio); - hugetlb_vmemmap_optimize(h, &folio->page); + hugetlb_vmemmap_optimize_folio(h, folio); } static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) @@ -2462,7 +2462,7 @@ retry: * non-vmemmap optimized hugetlb folios. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { spin_lock_irq(&hugetlb_lock); add_hugetlb_folio(h, folio, false); @@ -3886,11 +3886,11 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) /* * If vmemmap already existed for folio, the remove routine above would * have cleared the hugetlb folio flag. Hence the folio is technically - * no longer a hugetlb folio. hugetlb_vmemmap_restore can only be + * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be * passed hugetlb folios and will BUG otherwise. */ if (folio_test_hugetlb(folio)) { - rc = hugetlb_vmemmap_restore(h, &folio->page); + rc = hugetlb_vmemmap_restore_folio(h, folio); if (rc) { /* Allocation of vmemmmap failed, we can not demote folio */ spin_lock_irq(&hugetlb_lock); diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index d2999c303031..87818ee7f01d 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -495,14 +495,15 @@ EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0); -static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, unsigned long flags) +static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE(!PageHuge(head)); - if (!HPageVmemmapOptimized(head)) + if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); @@ -518,7 +519,7 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } @@ -526,18 +527,18 @@ static int __hugetlb_vmemmap_restore(const struct hstate *h, struct page *head, } /** - * hugetlb_vmemmap_restore - restore previously optimized (by - * hugetlb_vmemmap_optimize()) vmemmap pages which + * hugetlb_vmemmap_restore_folio - restore previously optimized (by + * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be restored. + * @folio: the folio whose vmemmap pages will be restored. * - * Return: %0 if @head's vmemmap pages have been reallocated and remapped, + * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { - return __hugetlb_vmemmap_restore(h, head, 0); + return __hugetlb_vmemmap_restore_folio(h, folio, 0); } /** @@ -563,7 +564,7 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h, list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { - ret = __hugetlb_vmemmap_restore(h, &folio->page, + ret = __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_REMAP_NO_TLB_FLUSH); if (ret) break; @@ -640,12 +641,13 @@ static bool vmemmap_should_optimize(const struct hstate *h, const struct page *h return true; } -static int __hugetlb_vmemmap_optimize(const struct hstate *h, - struct page *head, +static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, + struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; + struct page *head = &folio->page; unsigned long vmemmap_start = (unsigned long)head, vmemmap_end; unsigned long vmemmap_reuse; @@ -665,7 +667,7 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ - SetHPageVmemmapOptimized(head); + folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; @@ -681,27 +683,27 @@ static int __hugetlb_vmemmap_optimize(const struct hstate *h, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); - ClearHPageVmemmapOptimized(head); + folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** - * hugetlb_vmemmap_optimize - optimize @head page's vmemmap pages. + * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. - * @head: the head page whose vmemmap pages will be optimized. + * @folio: the folio whose vmemmap pages will be optimized. * - * This function only tries to optimize @head's vmemmap pages and does not + * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller - * can use HPageVmemmapOptimized(@head) to detect if @head's vmemmap pages - * have been optimized. + * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's + * vmemmap pages have been optimized. */ -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); - __hugetlb_vmemmap_optimize(h, head, &vmemmap_pages, 0); + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0); free_vmemmap_page_list(&vmemmap_pages); } @@ -745,7 +747,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { - int ret = __hugetlb_vmemmap_optimize(h, &folio->page, + int ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); @@ -754,14 +756,14 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both spliting fails * halfway and head page allocation also failed. In this - * case __hugetlb_vmemmap_optimize() would free memory + * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); - __hugetlb_vmemmap_optimize(h, &folio->page, + __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_REMAP_NO_TLB_FLUSH); } diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index a0dcf49f46ba..9fd0cb270502 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -18,11 +18,11 @@ #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP -int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head); +int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio); long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios); -void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head); +void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio); void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list); static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h) @@ -43,7 +43,7 @@ static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate return size > 0 ? size : 0; } #else -static inline int hugetlb_vmemmap_restore(const struct hstate *h, struct page *head) +static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return 0; } @@ -56,7 +56,7 @@ static long hugetlb_vmemmap_restore_folios(const struct hstate *h, return 0; } -static inline void hugetlb_vmemmap_optimize(const struct hstate *h, struct page *head) +static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { } -- cgit v1.2.3 From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:49 +0100 Subject: buffer: make folio_create_empty_buffers() return a buffer_head Patch series "Finish the create_empty_buffers() transition", v2. Pankaj recently added folio_create_empty_buffers() as the folio equivalent to create_empty_buffers(). This patch set finishes the conversion by first converting all remaining filesystems to call folio_create_empty_buffers(), then renaming it back to create_empty_buffers(). I took the opportunity to make a few simplifications like making folio_create_empty_buffers() return the head buffer and extracting get_nth_bh() from nilfs2. A few of the patches in this series aren't directly related to create_empty_buffers(), but I saw them while I was working on this and thought they'd be easy enough to add to this series. Compile-tested only, other than ext4. This patch (of 26): Almost all callers want to know the first BH that was allocated for this folio. We already have that handy, so return it. Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 24 +++++++++++++----------- include/linux/buffer_head.h | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 86ddfa52c699..bfa7604d1891 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,8 +1641,8 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state) +struct buffer_head *folio_create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; @@ -1669,6 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, } folio_attach_private(folio, head); spin_unlock(&folio->mapping->private_lock); + + return head; } EXPORT_SYMBOL(folio_create_empty_buffers); @@ -1770,13 +1772,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { + struct buffer_head *bh; + BUG_ON(!folio_test_locked(folio)); - if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, - 1 << READ_ONCE(inode->i_blkbits), - b_state); - return folio_buffers(folio); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), b_state); + return bh; } /* @@ -2676,10 +2680,8 @@ int block_truncate_page(struct address_space *mapping, return PTR_ERR(folio); bh = folio_buffers(folio); - if (!bh) { - folio_create_empty_buffers(folio, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = folio_create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3dc4720e4773..1001244a8941 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,8 +203,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state); +struct buffer_head *folio_create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); void end_buffer_async_write(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 4f05f139e3f8938c4c456bd886355c9bbcc3190d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:50 +0100 Subject: mpage: convert map_buffer_to_folio() to folio_create_empty_buffers() Saves a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/mpage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/mpage.c b/fs/mpage.c index 242e213ee064..964a6efe594d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, folio_mark_uptodate(folio); return; } - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); + head = folio_create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; -- cgit v1.2.3 From d4059993674b533798a551859042957bb5578332 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:51 +0100 Subject: ext4: convert to folio_create_empty_buffers Remove an unnecessary folio->page->folio conversion and take advantage of the new return value from folio_create_empty_buffers(). Link: https://lkml.kernel.org/r/20231016201114.1928083-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 14 +++++--------- fs/ext4/move_extent.c | 11 +++++------ 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ce35f1c8b0a..8e431ff2fd95 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1020,10 +1020,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, BUG_ON(from > to); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1153,7 +1151,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); + folio_create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -3643,10 +3641,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); - if (!bh) { - create_empty_buffers(&folio->page, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = folio_create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 18a9e7c47975..7fe448fb948b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; @@ -380,9 +378,10 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!folio_buffers(folio[0])) - create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); bh = folio_buffers(folio[0]); + if (!bh) + bh = folio_create_empty_buffers(folio[0], + 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { -- cgit v1.2.3 From 0217fbb0271a7c78e16629cf45375916ec2eb35f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:52 +0100 Subject: buffer: add get_nth_bh() Extract this useful helper from nilfs_page_get_nth_block() Link: https://lkml.kernel.org/r/20231016201114.1928083-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.h | 7 +------ include/linux/buffer_head.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 21ddcdd4d63e..344d71942d36 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -55,12 +55,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, static inline struct buffer_head * nilfs_page_get_nth_block(struct page *page, unsigned int count) { - struct buffer_head *bh = page_buffers(page); - - while (count-- > 0) - bh = bh->b_this_page; - get_bh(bh); - return bh; + return get_nth_bh(page_buffers(page), count); } #endif /* _NILFS_PAGE_H */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1001244a8941..3d85a0cf0ca5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -457,6 +457,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) return __bread_gfp(bdev, block, size, __GFP_MOVABLE); } +/** + * get_nth_bh - Get a reference on the n'th buffer after this one. + * @bh: The buffer to start counting from. + * @count: How many buffers to skip. + * + * This is primarily useful for finding the nth buffer in a folio; in + * that case you pass the head buffer and the byte offset in the folio + * divided by the block size. It can be used for other purposes, but + * it will wrap at the end of the folio rather than returning NULL or + * proceeding to the next folio for you. + * + * Return: The requested buffer with an elevated refcount. + */ +static inline __must_check +struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count) +{ + while (count--) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_BUFFER_HEAD -- cgit v1.2.3 From 81cb277ebdfd73b4b4cbfcdf377b4b514e90520a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:53 +0100 Subject: gfs2: convert inode unstuffing to use a folio Use the folio APIs, removing numerous hidden calls to compound_head(). Also remove the stale comment about the page being looked up if it's NULL. Link: https://lkml.kernel.org/r/20231016201114.1928083-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/bmap.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ef7017fb6951..247d2c16593c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -43,53 +43,51 @@ struct metapath { static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); /** - * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @page: The (optional) page. This is looked up if @page is NULL + * @folio: The folio. * * Returns: errno */ - -static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, - u64 block, struct page *page) +static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct folio *folio) { struct inode *inode = &ip->i_inode; - if (!PageUptodate(page)) { - void *kaddr = kmap(page); + if (!folio_test_uptodate(folio)) { + void *kaddr = kmap_local_folio(folio, 0); u64 dsize = i_size_read(inode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap(page); + memset(kaddr + dsize, 0, folio_size(folio) - dsize); + kunmap_local(kaddr); - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (gfs2_is_jdata(ip)) { - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (!bh) + bh = folio_create_empty_buffers(folio, + BIT(inode->i_blkbits), BIT(BH_Uptodate)); - bh = page_buffers(page); if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } else { - set_page_dirty(page); + folio_mark_dirty(folio); gfs2_ordered_add_inode(ip); } return 0; } -static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; @@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { - error = gfs2_unstuffer_page(ip, dibh, block, page); + error = gfs2_unstuffer_folio(ip, dibh, block, folio); if (error) goto out_brelse; } @@ -157,17 +155,17 @@ out_brelse: int gfs2_unstuff_dinode(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; - struct page *page; + struct folio *folio; int error; down_write(&ip->i_rw_mutex); - page = grab_cache_page(inode->i_mapping, 0); - error = -ENOMEM; - if (!page) + folio = filemap_grab_folio(inode->i_mapping, 0); + error = PTR_ERR(folio); + if (IS_ERR(folio)) goto out; - error = __gfs2_unstuff_inode(ip, page); - unlock_page(page); - put_page(page); + error = __gfs2_unstuff_inode(ip, folio); + folio_unlock(folio); + folio_put(folio); out: up_write(&ip->i_rw_mutex); return error; -- cgit v1.2.3 From 0eb751791df86dc05a87fff1ada8d37044267a7e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:54 +0100 Subject: gfs2: convert gfs2_getbuf() to folios Remove several folio->page->folio conversions. Also use __GFP_NOFAIL instead of calling yield() and the new get_nth_bh(). Link: https://lkml.kernel.org/r/20231016201114.1928083-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/meta_io.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 924361fa510b..f1fac1b45059 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift; unsigned long index; @@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) bufnum = blkno - (index << shift); /* block buf index within page */ if (create) { - for (;;) { - page = grab_cache_page(mapping, index); - if (page) - break; - yield(); - } - if (!page_has_buffers(page)) - create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping) | __GFP_NOFAIL); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, + sdp->sd_sb.sb_bsize, 0); } else { - page = find_get_page_flags(mapping, index, - FGP_LOCK|FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - if (!page_has_buffers(page)) { - bh = NULL; - goto out_unlock; - } + bh = folio_buffers(folio); } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); + if (!bh) + goto out_unlock; + bh = get_nth_bh(bh, bufnum); if (!buffer_mapped(bh)) map_bh(bh, sdp->sd_vfs, blkno); out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return bh; } -- cgit v1.2.3 From c646e573729bfe885280c343203c775f1f5d71c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:55 +0100 Subject: gfs2: convert gfs2_getjdatabuf to use a folio Use the folio APIs, saving four hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/meta_io.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f1fac1b45059..f6d40d51f5ed 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -400,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) { struct address_space *mapping = ip->i_inode.i_mapping; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; unsigned long index = blkno >> shift; /* convert block to page */ unsigned int bufnum = blkno - (index << shift); - page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); - if (!page) - return NULL; - if (!page_has_buffers(page)) { - unlock_page(page); - put_page(page); + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); - unlock_page(page); - put_page(page); + bh = folio_buffers(folio); + if (bh) + bh = get_nth_bh(bh, bufnum); + folio_unlock(folio); + folio_put(folio); return bh; } -- cgit v1.2.3 From 4064a0aa8a6a341b14fb9ee8cf0f0bb86c575e3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:56 +0100 Subject: gfs2: convert gfs2_write_buf_to_page() to use a folio Remove several folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/quota.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index d3d013d1d5ac..25354278cecb 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -749,7 +749,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; @@ -758,15 +758,15 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, bsize, 0); - bh = page_buffers(page); - for(;;) { - /* Find the beginning block within the page */ + for (;;) { + /* Find the beginning block within the folio */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; bnum++; @@ -779,9 +779,10 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, goto unlock_out; /* If it's a newly allocated disk block, zero it */ if (buffer_new(bh)) - zero_user(page, bnum * bsize, bh->b_size); + folio_zero_range(folio, bnum * bsize, + bh->b_size); } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; @@ -797,17 +798,17 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, break; } - /* Write to the page, now that we have setup the buffer(s) */ - memcpy_to_page(page, off, buf, bytes); - flush_dcache_page(page); - unlock_page(page); - put_page(page); + /* Write to the folio, now that we have setup the buffer(s) */ + memcpy_to_folio(folio, off, buf, bytes); + flush_dcache_folio(folio); + folio_unlock(folio); + folio_put(folio); return 0; unlock_out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return -EIO; } -- cgit v1.2.3 From 6c346be91dcf2b588510e41e4e04c0638af3d536 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:57 +0100 Subject: nilfs2: convert nilfs_mdt_freeze_buffer to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 19c8158605ed..db2260d6e44d 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -560,17 +560,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen; - struct page *page; + struct folio *folio; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); - if (!page) - return -ENOMEM; + folio = filemap_grab_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, 0); + bh_frozen = folio_buffers(folio); + if (!bh_frozen) + bh_frozen = folio_create_empty_buffers(folio, 1 << blkbits, 0); - bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); if (!buffer_uptodate(bh_frozen)) nilfs_copy_buffer(bh_frozen, bh); @@ -582,8 +584,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) brelse(bh_frozen); /* already frozen */ } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } -- cgit v1.2.3 From c5521c7689b892c8ea684a284722b5584ba1ce8f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:58 +0100 Subject: nilfs2: convert nilfs_grab_buffer() to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b4e54d079b7d..1c075bd906c9 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -25,19 +25,19 @@ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) -static struct buffer_head * -__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, - int blkbits, unsigned long b_state) +static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, + unsigned long block, pgoff_t index, int blkbits, + unsigned long b_state) { unsigned long first_block; - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, b_state); + if (!bh) + bh = folio_create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = get_nth_bh(bh, block - first_block); touch_buffer(bh); wait_on_buffer(bh); @@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, { int blkbits = inode->i_blkbits; pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits); - struct page *page; + struct folio *folio; struct buffer_head *bh; - page = grab_cache_page(mapping, index); - if (unlikely(!page)) + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) return NULL; - bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state); if (unlikely(!bh)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } return bh; -- cgit v1.2.3 From 4093602d6bbb2c82b922d1b442a022a069cdb59e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:59 +0100 Subject: nilfs2: convert nilfs_copy_page() to nilfs_copy_folio() Both callers already have a folio, so pass it in and use it directly. Removes a lot of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 50 ++++++++++++++++++++++++++------------------------ mm/util.c | 1 + 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1c075bd906c9..696215d899bf 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page) } /** - * nilfs_copy_page -- copy the page with buffers - * @dst: destination page - * @src: source page - * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * nilfs_copy_folio -- copy the folio with buffers + * @dst: destination folio + * @src: source folio + * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads. * - * This function is for both data pages and btnode pages. The dirty flag - * should be treated by caller. The page must not be under i/o. - * Both src and dst page must be locked + * This function is for both data folios and btnode folios. The dirty flag + * should be treated by caller. The folio must not be under i/o. + * Both src and dst folio must be locked */ -static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +static void nilfs_copy_folio(struct folio *dst, struct folio *src, + bool copy_dirty) { struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; - BUG_ON(PageWriteback(dst)); + BUG_ON(folio_test_writeback(dst)); - sbh = page_buffers(src); - if (!page_has_buffers(dst)) - create_empty_buffers(dst, sbh->b_size, 0); + sbh = folio_buffers(src); + dbh = folio_buffers(dst); + if (!dbh) + dbh = folio_create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); - dbh = dbufs = page_buffers(dst); + dbufs = dbh; do { lock_buffer(sbh); lock_buffer(dbh); @@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) dbh = dbh->b_this_page; } while (dbh != dbufs); - copy_highpage(dst, src); + folio_copy(dst, src); - if (PageUptodate(src) && !PageUptodate(dst)) - SetPageUptodate(dst); - else if (!PageUptodate(src) && PageUptodate(dst)) - ClearPageUptodate(dst); - if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) - SetPageMappedToDisk(dst); - else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) - ClearPageMappedToDisk(dst); + if (folio_test_uptodate(src) && !folio_test_uptodate(dst)) + folio_mark_uptodate(dst); + else if (!folio_test_uptodate(src) && folio_test_uptodate(dst)) + folio_clear_uptodate(dst); + if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst)) + folio_set_mappedtodisk(dst); + else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst)) + folio_clear_mappedtodisk(dst); do { unlock_buffer(sbh); @@ -269,7 +271,7 @@ repeat: NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(&dfolio->page, &folio->page, 1); + nilfs_copy_folio(dfolio, folio, true); filemap_dirty_folio(folio_mapping(dfolio), dfolio); folio_unlock(dfolio); @@ -314,7 +316,7 @@ repeat: if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); - nilfs_copy_page(&dfolio->page, &folio->page, 0); + nilfs_copy_folio(dfolio, folio, false); folio_unlock(dfolio); folio_put(dfolio); /* Do we not need to remove folio from smap here? */ diff --git a/mm/util.c b/mm/util.c index 8cbbfd3a3d59..eefa0336d38c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -799,6 +799,7 @@ void folio_copy(struct folio *dst, struct folio *src) cond_resched(); } } +EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; -- cgit v1.2.3 From 1a846bf3884694b2f9ecbd70011927c2fb40f224 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:00 +0100 Subject: nilfs2: convert nilfs_mdt_forget_block() to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index db2260d6e44d..11b7cf4acc92 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { - pgoff_t index = (pgoff_t)block >> - (PAGE_SHIFT - inode->i_blkbits); - struct page *page; - unsigned long first_block; + pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits); + struct folio *folio; + struct buffer_head *bh; int ret = 0; int still_dirty; - page = find_lock_page(inode->i_mapping, index); - if (!page) + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) return -ENOENT; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - first_block = (unsigned long)index << - (PAGE_SHIFT - inode->i_blkbits); - if (page_has_buffers(page)) { - struct buffer_head *bh; - - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = folio_buffers(folio); + if (bh) { + unsigned long first_block = index << + (PAGE_SHIFT - inode->i_blkbits); + bh = get_nth_bh(bh, block - first_block); nilfs_forget_buffer(bh); } - still_dirty = PageDirty(page); - unlock_page(page); - put_page(page); + still_dirty = folio_test_dirty(folio); + folio_unlock(folio); + folio_put(folio); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) -- cgit v1.2.3 From 664c87b75ef6ec3ebb97383891a6787b63925547 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:01 +0100 Subject: nilfs2: convert nilfs_mdt_get_frozen_buffer to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 11b7cf4acc92..7b754e6494d7 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen = NULL; - struct page *page; + struct folio *folio; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); - if (page) { - if (page_has_buffers(page)) { + folio = filemap_lock_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (!IS_ERR(folio)) { + bh_frozen = folio_buffers(folio); + if (bh_frozen) { n = bh_offset(bh) >> inode->i_blkbits; - bh_frozen = nilfs_page_get_nth_block(page, n); + bh_frozen = get_nth_bh(bh_frozen, n); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return bh_frozen; } -- cgit v1.2.3 From 73c32e07a3977f8470b29889d30d2f808e6fee4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:02 +0100 Subject: nilfs2: remove nilfs_page_get_nth_block All users have now been converted to get_nth_block(). Link: https://lkml.kernel.org/r/20231016201114.1928083-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 344d71942d36..d249ea1cefff 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -52,10 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, #define NILFS_PAGE_BUG(page, m, a...) \ do { nilfs_page_bug(page); BUG(); } while (0) -static inline struct buffer_head * -nilfs_page_get_nth_block(struct page *page, unsigned int count) -{ - return get_nth_bh(page_buffers(page), count); -} - #endif /* _NILFS_PAGE_H */ -- cgit v1.2.3 From 922b12eff0b293fc13ae4045c7d7264e18938878 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:03 +0100 Subject: nilfs2: convert nilfs_lookup_dirty_data_buffers to use folio_create_empty_buffers This function was already using a folio, so this update to the new API removes a single folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7ec16879756e..94388fe83cf8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, continue; } head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, + i_blocksize(inode), 0); folio_unlock(folio); bh = head; -- cgit v1.2.3 From a2da3afce96ce747ec0e1670bfc73fec85d20f95 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:04 +0100 Subject: ntfs: convert ntfs_read_block() to use a folio The caller already has the folio, so pass it in and use the folio API throughout saving five hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/aops.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e158bce4192..d66a9f5ffde9 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -145,13 +145,12 @@ still_busy: } /** - * ntfs_read_block - fill a @page of an address space with data - * @page: page cache page to fill with data + * ntfs_read_block - fill a @folio of an address space with data + * @folio: page cache folio to fill with data * - * Fill the page @page of the address space belonging to the @page->host inode. * We read each buffer asynchronously and when all buffers are read in, our io * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the page before finally marking it uptodate and + * applies the mst fixups to the folio before finally marking it uptodate and * unlocking it. * * We only enforce allocated_size limit because i_size is checked for in @@ -161,7 +160,7 @@ still_busy: * * Contains an adapted version of fs/buffer.c::block_read_full_folio(). */ -static int ntfs_read_block(struct page *page) +static int ntfs_read_block(struct folio *folio) { loff_t i_size; VCN vcn; @@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page) int i, nr; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; @@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page) blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) { - unlock_page(page); - return -ENOMEM; - } - } - bh = head = page_buffers(page); - BUG_ON(!bh); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); + bh = head; /* * We may be racing with truncate. To avoid some of the problems we @@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page) * may leave some buffers unmapped which are now allocated. This is * not a problem since these buffers will just get mapped when a write * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the page is being + * on due to the runlist being incomplete and if the folio is being * fully truncated, truncate will throw it away as soon as we unlock * it so no need to worry what we do with it. */ - iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; init_size = ni->initialized_size; @@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page) } zblock = (init_size + blocksize - 1) >> blocksize_bits; - /* Loop through all the buffers in the page. */ + /* Loop through all the buffers in the folio. */ rl = NULL; nr = i = 0; do { @@ -299,7 +293,7 @@ lock_retry_remap: if (!err) err = -EIO; bh->b_blocknr = -1; - SetPageError(page); + folio_set_error(folio); ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " "attribute type 0x%x, vcn 0x%llx, " "offset 0x%x because its location on " @@ -312,13 +306,13 @@ lock_retry_remap: /* * Either iblock was outside lblock limits or * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the page and set the buffer uptodate. + * of the folio and set the buffer uptodate. */ handle_hole: bh->b_blocknr = -1UL; clear_buffer_mapped(bh); handle_zblock: - zero_user(page, i * blocksize, blocksize); + folio_zero_range(folio, i * blocksize, blocksize); if (likely(!err)) set_buffer_uptodate(bh); } while (i++, iblock++, (bh = bh->b_this_page) != head); @@ -349,11 +343,11 @@ handle_zblock: return 0; } /* No i/o was scheduled on any of the buffers. */ - if (likely(!PageError(page))) - SetPageUptodate(page); + if (likely(!folio_test_error(folio))) + folio_mark_uptodate(folio); else /* Signal synchronous i/o error. */ nr = -EIO; - unlock_page(page); + folio_unlock(folio); return nr; } @@ -433,7 +427,7 @@ retry_readpage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* Normal, non-resident data stream. */ - return ntfs_read_block(page); + return ntfs_read_block(folio); } /* * Attribute is resident, implying it is not compressed or encrypted. -- cgit v1.2.3 From a04eb7cb186b4d537eefc2d68dba8a7e5eb7e6d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:05 +0100 Subject: ntfs: convert ntfs_writepage to use a folio Use folio APIs throughout. Saves many hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/aops.c | 211 +++++++++++++++++++++++++++------------------------------ 1 file changed, 100 insertions(+), 111 deletions(-) diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d66a9f5ffde9..c4426992a2ee 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -501,28 +501,29 @@ err_out: #ifdef NTFS_RW /** - * ntfs_write_block - write a @page to the backing store - * @page: page cache page to write out + * ntfs_write_block - write a @folio to the backing store + * @folio: page cache folio to write out * @wbc: writeback control structure * - * This function is for writing pages belonging to non-resident, non-mst + * This function is for writing folios belonging to non-resident, non-mst * protected attributes to their backing store. * - * For a page with buffers, map and write the dirty buffers asynchronously - * under page writeback. For a page without buffers, create buffers for the - * page, then proceed as above. + * For a folio with buffers, map and write the dirty buffers asynchronously + * under folio writeback. For a folio without buffers, create buffers for the + * folio, then proceed as above. * - * If a page doesn't have buffers the page dirty state is definitive. If a page - * does have buffers, the page dirty state is just a hint, and the buffer dirty - * state is definitive. (A hint which has rules: dirty buffers against a clean - * page is illegal. Other combinations are legal and need to be handled. In - * particular a dirty page containing clean buffers for example.) + * If a folio doesn't have buffers the folio dirty state is definitive. If + * a folio does have buffers, the folio dirty state is just a hint, + * and the buffer dirty state is definitive. (A hint which has rules: + * dirty buffers against a clean folio is illegal. Other combinations are + * legal and need to be handled. In particular a dirty folio containing + * clean buffers for example.) * * Return 0 on success and -errno on error. * * Based on ntfs_read_block() and __block_write_full_folio(). */ -static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) { VCN vcn; LCN lcn; @@ -540,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) bool need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, page->index); + "0x%lx.", ni->mft_no, ni->type, folio->index); BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - BUG_ON(!PageUptodate(page)); - create_empty_buffers(page, blocksize, + head = folio_buffers(folio); + if (!head) { + BUG_ON(!folio_test_uptodate(folio)); + head = folio_create_empty_buffers(folio, blocksize, (1 << BH_Uptodate) | (1 << BH_Dirty)); - if (unlikely(!page_has_buffers(page))) { - ntfs_warning(vol->sb, "Error allocating page " - "buffers. Redirtying page so we try " - "again later."); - /* - * Put the page back on mapping->dirty_pages, but leave - * its buffers' dirty state as-is. - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } } - bh = head = page_buffers(page); - BUG_ON(!bh); + bh = head; /* NOTE: Different naming scheme to ntfs_read_block()! */ - /* The first block in the page. */ - block = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + /* The first block in the folio. */ + block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(vi); @@ -591,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ /* - * Loop through all the buffers in the page, mapping all the dirty + * Loop through all the buffers in the folio, mapping all the dirty * buffers to disk addresses and handling any aliases from the * underlying block device's mapping. */ @@ -610,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely(block >= dblock)) { /* * Mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. The contents of such buffers * were zeroed by ntfs_writepage(). * * FIXME: What about the small race window where * ntfs_writepage() has not done any clearing because - * the page was within i_size but before we get here, + * the folio was within i_size but before we get here, * vmtruncate() modifies i_size? */ clear_buffer_dirty(bh); @@ -632,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely((block >= iblock) && (initialized_size < i_size))) { /* - * If this page is fully outside initialized - * size, zero out all pages between the current - * initialized size and the current page. Just + * If this folio is fully outside initialized + * size, zero out all folios between the current + * initialized size and the current folio. Just * use ntfs_read_folio() to do the zeroing * transparently. */ if (block > iblock) { // TODO: - // For each page do: - // - read_cache_page() - // Again for each page do: - // - wait_on_page_locked() - // - Check (PageUptodate(page) && - // !PageError(page)) + // For each folio do: + // - read_cache_folio() + // Again for each folio do: + // - wait_on_folio_locked() + // - Check (folio_test_uptodate(folio) && + // !folio_test_error(folio)) // Update initialized size in the attribute and // in the inode. - // Again, for each page do: + // Again, for each folio do: // block_dirty_folio(); - // put_page() + // folio_put() // We don't need to wait on the writes. // Update iblock. } /* - * The current page straddles initialized size. Zero + * The current folio straddles initialized size. Zero * all non-uptodate buffers and set them uptodate (and * dirty?). Note, there aren't any non-uptodate buffers - * if the page is uptodate. - * FIXME: For an uptodate page, the buffers may need to + * if the folio is uptodate. + * FIXME: For an uptodate folio, the buffers may need to * be written out because they were not initialized on * disk before. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { // TODO: // Zero any non-uptodate buffers up to i_size. // Set them uptodate and dirty. @@ -721,14 +710,14 @@ lock_retry_remap: unsigned long *bpos, *bend; /* Check if the buffer is zero. */ - kaddr = kmap_atomic(page); - bpos = (unsigned long *)(kaddr + bh_offset(bh)); - bend = (unsigned long *)((u8*)bpos + blocksize); + kaddr = kmap_local_folio(folio, bh_offset(bh)); + bpos = (unsigned long *)kaddr; + bend = (unsigned long *)(kaddr + blocksize); do { if (unlikely(*bpos)) break; } while (likely(++bpos < bend)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (bpos == bend) { /* * Buffer is zero and sparse, no need to write @@ -768,7 +757,7 @@ lock_retry_remap: if (err == -ENOENT || lcn == LCN_ENOENT) { bh->b_blocknr = -1; clear_buffer_dirty(bh); - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); err = 0; continue; @@ -795,7 +784,7 @@ lock_retry_remap: bh = head; /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { int uptodate = 1; do { if (!buffer_uptodate(bh)) { @@ -805,7 +794,7 @@ lock_retry_remap: } } while ((bh = bh->b_this_page) != head); if (uptodate) - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Setup all mapped, dirty buffers for async write i/o. */ @@ -820,7 +809,7 @@ lock_retry_remap: } else if (unlikely(err)) { /* * For the error case. The buffer may have been set - * dirty during attachment to a dirty page. + * dirty during attachment to a dirty folio. */ if (err != -ENOMEM) clear_buffer_dirty(bh); @@ -833,20 +822,20 @@ lock_retry_remap: err = 0; else if (err == -ENOMEM) { ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying page so we try again " + "Redirtying folio so we try again " "later."); /* - * Put the page back on mapping->dirty_pages, but + * Put the folio back on mapping->dirty_pages, but * leave its buffer's dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else - SetPageError(page); + folio_set_error(folio); } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ /* Submit the prepared buffers for i/o. */ need_end_writeback = true; @@ -858,11 +847,11 @@ lock_retry_remap: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); - /* If no i/o was started, need to end_page_writeback(). */ + /* If no i/o was started, need to end writeback here. */ if (unlikely(need_end_writeback)) - end_page_writeback(page); + folio_end_writeback(folio); ntfs_debug("Done."); return err; @@ -1331,8 +1320,9 @@ done: */ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); loff_t i_size; - struct inode *vi = page->mapping->host; + struct inode *vi = folio->mapping->host; ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); char *addr; ntfs_attr_search_ctx *ctx = NULL; @@ -1341,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) int err; retry_writepage: - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); i_size = i_size_read(vi); - /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + /* Is the folio fully outside i_size? (truncate in progress) */ + if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) { - struct folio *folio = page_folio(page); /* - * The page may have dirty, unmapped buffers. Make them + * The folio may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ block_invalidate_folio(folio, 0, folio_size(folio)); @@ -1367,7 +1356,7 @@ retry_writepage: if (ni->type != AT_INDEX_ALLOCATION) { /* If file is encrypted, deny access, just like NT4. */ if (NInoEncrypted(ni)) { - unlock_page(page); + folio_unlock(folio); BUG_ON(ni->type != AT_DATA); ntfs_debug("Denying write access to encrypted file."); return -EACCES; @@ -1378,14 +1367,14 @@ retry_writepage: BUG_ON(ni->name_len); // TODO: Implement and replace this with // return ntfs_write_compressed_block(page); - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to compressed files is " "not supported yet. Sorry."); return -EOPNOTSUPP; } // TODO: Implement and remove this check. if (NInoNonResident(ni) && NInoSparse(ni)) { - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to sparse files is not " "supported yet. Sorry."); return -EOPNOTSUPP; @@ -1394,34 +1383,34 @@ retry_writepage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* We have to zero every time due to mmap-at-end-of-file. */ - if (page->index >= (i_size >> PAGE_SHIFT)) { - /* The page straddles i_size. */ - unsigned int ofs = i_size & ~PAGE_MASK; - zero_user_segment(page, ofs, PAGE_SIZE); + if (folio->index >= (i_size >> PAGE_SHIFT)) { + /* The folio straddles i_size. */ + unsigned int ofs = i_size & (folio_size(folio) - 1); + folio_zero_segment(folio, ofs, folio_size(folio)); } /* Handle mst protected attributes. */ if (NInoMstProtected(ni)) return ntfs_write_mst_block(page, wbc); /* Normal, non-resident data stream. */ - return ntfs_write_block(page, wbc); + return ntfs_write_block(folio, wbc); } /* * Attribute is resident, implying it is not compressed, encrypted, or * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a page, so can simply return error on - * any pages with index above 0. Note the attribute can actually be + * record and hence smaller than a folio, so can simply return error on + * any folios with index above 0. Note the attribute can actually be * marked compressed but if it is resident the actual data is not * compressed so we are ok to ignore the compressed flag here. */ - BUG_ON(page_has_buffers(page)); - BUG_ON(!PageUptodate(page)); - if (unlikely(page->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " - "Aborting write.", page->index); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + BUG_ON(folio_buffers(folio)); + BUG_ON(!folio_test_uptodate(folio)); + if (unlikely(folio->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " + "Aborting write.", folio->index); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); return -EIO; } if (!NInoAttr(ni)) @@ -1454,12 +1443,12 @@ retry_writepage: if (unlikely(err)) goto err_out; /* - * Keep the VM happy. This must be done otherwise the radix-tree tag - * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + * Keep the VM happy. This must be done otherwise + * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); i_size = i_size_read(vi); if (unlikely(attr_len > i_size)) { @@ -1474,18 +1463,18 @@ retry_writepage: /* Shrinking cannot fail. */ BUG_ON(err); } - addr = kmap_atomic(page); - /* Copy the data from the page to the mft record. */ + addr = kmap_local_folio(folio, 0); + /* Copy the data from the folio to the mft record. */ memcpy((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset), addr, attr_len); - /* Zero out of bounds area in the page cache page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - kunmap_atomic(addr); - flush_dcache_page(page); + /* Zero out of bounds area in the page cache folio. */ + memset(addr + attr_len, 0, folio_size(folio) - attr_len); + kunmap_local(addr); + flush_dcache_folio(folio); flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the page. */ - end_page_writeback(page); + /* We are done with the folio. */ + folio_end_writeback(folio); /* Finally, mark the mft record dirty, so it gets written back. */ mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); @@ -1496,18 +1485,18 @@ err_out: ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " "page so we try again later."); /* - * Put the page back on mapping->dirty_pages, but leave its + * Put the folio back on mapping->dirty_pages, but leave its * buffers' dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " "error %i.", err); - SetPageError(page); + folio_set_error(folio); NVolSetErrors(ni->vol); } - unlock_page(page); + folio_unlock(folio); if (ctx) ntfs_attr_put_search_ctx(ctx); if (m) -- cgit v1.2.3 From 24a7b35285c5514393e7ed46407111c68f4602ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:06 +0100 Subject: ntfs: convert ntfs_prepare_pages_for_non_resident_write() to folios Convert each element of the pages array to a folio before using it. This in no way renders the function large-folio safe, but it does remove a lot of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/file.c | 89 +++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 48 deletions(-) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index cbc545999cfe..099141d20db6 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, LCN lcn; s64 bh_pos, vcn_len, end, initialized_size; sector_t lcn_block; - struct page *page; + struct folio *folio; struct inode *vi; ntfs_inode *ni, *base_ni = NULL; ntfs_volume *vol; @@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, (long long)pos, bytes); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - u = 0; - do { - page = pages[u]; - BUG_ON(!page); - /* - * create_empty_buffers() will create uptodate/dirty buffers if - * the page is uptodate/dirty. - */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) - return -ENOMEM; - } - } while (++u < nr_pages); rl_write_locked = false; rl = NULL; err = 0; @@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, end = pos + bytes; cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; /* - * Loop over each page and for each page over each buffer. Use goto to + * Loop over each buffer in each folio. Use goto to * reduce indentation. */ u = 0; -do_next_page: - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); +do_next_folio: + folio = page_folio(pages[u]); + bh_pos = folio_pos(folio); + head = folio_buffers(folio); + if (!head) + /* + * create_empty_buffers() will create uptodate/dirty + * buffers if the folio is uptodate/dirty. + */ + head = folio_create_empty_buffers(folio, blocksize, 0); + bh = head; do { VCN cdelta; s64 bh_end; @@ -653,15 +646,15 @@ do_next_page: if (buffer_uptodate(bh)) continue; /* - * The buffer is not uptodate. If the page is uptodate + * The buffer is not uptodate. If the folio is uptodate * set the buffer uptodate and otherwise ignore it. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } /* - * Neither the page nor the buffer are uptodate. If + * Neither the folio nor the buffer are uptodate. If * the buffer is only partially being written to, we * need to read it in before the write, i.e. now. */ @@ -679,7 +672,7 @@ do_next_page: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -706,7 +699,7 @@ map_buffer_cached: (bh_cofs >> blocksize_bits); set_buffer_mapped(bh); /* - * If the page is uptodate so is the buffer. If the + * If the folio is uptodate so is the buffer. If the * buffer is fully outside the write, we ignore it if * it was already allocated and we mark it dirty so it * gets written out if we allocated it. On the other @@ -714,7 +707,7 @@ map_buffer_cached: * marking it dirty we set buffer_new so we can do * error recovery. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); if (unlikely(was_hole)) { @@ -754,7 +747,8 @@ map_buffer_cached: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, + bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -773,7 +767,7 @@ map_buffer_cached: */ if (bh_end <= pos || bh_pos >= end) { if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -786,7 +780,7 @@ map_buffer_cached: u8 *kaddr; unsigned pofs; - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (bh_pos < pos) { pofs = bh_pos & ~PAGE_MASK; memset(kaddr + pofs, 0, pos - bh_pos); @@ -795,8 +789,8 @@ map_buffer_cached: pofs = end & ~PAGE_MASK; memset(kaddr + pofs, 0, bh_end - end); } - kunmap_atomic(kaddr); - flush_dcache_page(page); + kunmap_local(kaddr); + flush_dcache_folio(folio); } continue; } @@ -809,11 +803,12 @@ map_buffer_cached: initialized_size = ni->allocated_size; read_unlock_irqrestore(&ni->size_lock, flags); if (bh_pos > initialized_size) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } continue; @@ -927,17 +922,17 @@ rl_not_mapped_enoent: bh->b_blocknr = -1; /* * If the buffer is uptodate we skip it. If it - * is not but the page is uptodate, we can set - * the buffer uptodate. If the page is not + * is not but the folio is uptodate, we can set + * the buffer uptodate. If the folio is not * uptodate, we can clear the buffer and set it * uptodate. Whether this is worthwhile is * debatable and this could be removed. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -1167,7 +1162,7 @@ rl_not_mapped_enoent: } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); /* If there are no errors, do the next page. */ if (likely(!err && ++u < nr_pages)) - goto do_next_page; + goto do_next_folio; /* If there are no errors, release the runlist lock if we took it. */ if (likely(!err)) { if (unlikely(rl_write_locked)) { @@ -1185,9 +1180,8 @@ rl_not_mapped_enoent: bh = *--wait_bh; wait_on_buffer(bh); if (likely(buffer_uptodate(bh))) { - page = bh->b_page; - bh_pos = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); + folio = bh->b_folio; + bh_pos = folio_pos(folio) + bh_offset(bh); /* * If the buffer overflows the initialized size, need * to zero the overflowing region. @@ -1197,7 +1191,7 @@ rl_not_mapped_enoent: if (likely(bh_pos < initialized_size)) ofs = initialized_size - bh_pos; - zero_user_segment(page, bh_offset(bh) + ofs, + folio_zero_segment(folio, bh_offset(bh) + ofs, blocksize); } } else /* if (unlikely(!buffer_uptodate(bh))) */ @@ -1324,21 +1318,20 @@ rl_not_mapped_enoent: u = 0; end = bh_cpos << vol->cluster_size_bits; do { - page = pages[u]; - bh = head = page_buffers(page); + folio = page_folio(pages[u]); + bh = head = folio_buffers(folio); do { if (u == nr_pages && - ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh) >= end) + folio_pos(folio) + bh_offset(bh) >= end) break; if (!buffer_new(bh)) continue; clear_buffer_new(bh); if (!buffer_uptodate(bh)) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } -- cgit v1.2.3 From c3f4200ac61ae98ea29d28519ea5076c04f22e06 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:07 +0100 Subject: ntfs3: convert ntfs_zero_range() to use a folio Use the folio API throughout, saving six hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 962f12ce6c0a..a003a69091a2 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) struct buffer_head *head, *bh; u32 bh_next, bh_off, to; sector_t iblock; - struct page *page; + struct folio *folio; for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) PAGE_SIZE; iblock = page_off >> inode->i_blkbits; - page = find_or_create_page(mapping, idx, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, idx, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); - bh = head = page_buffers(page); + bh = head; bh_off = 0; do { bh_next = bh_off + blocksize; @@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } /* Ok, it's mapped. Make sure it's up-to-date. */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = bh_read(bh, 0); if (err < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } } @@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); cond_resched(); } out: -- cgit v1.2.3 From 414ae0a440333143dcac39faaef00f098cceeff5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:08 +0100 Subject: ocfs2: convert ocfs2_map_page_blocks to use a folio Convert the page argument to a folio and then use the folio APIs throughout. Replaces three hidden calls to compound_head() with one explicit one. Link: https://lkml.kernel.org/r/20231016201114.1928083-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0fdba30740ab..95d1e70b4401 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page, * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ -static int ocfs2_should_read_blk(struct inode *inode, struct page *page, +static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, unsigned int block_start) { - u64 offset = page_offset(page) + block_start; + u64 offset = folio_pos(folio) + block_start; if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) return 1; @@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { + struct folio *folio = page_folio(page); int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; unsigned int bsize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, bsize, 0); - head = page_buffers(page); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; @@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * they may belong to unallocated clusters. */ if (block_start >= to || block_end <= from) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); continue; } @@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, clean_bdev_bh_alias(bh); } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && - ocfs2_should_read_blk(inode, page, block_start) && + ocfs2_should_read_blk(inode, folio, block_start) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; @@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user(page, block_start, bh->b_size); + folio_zero_range(folio, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); -- cgit v1.2.3 From 44f68575267ecfc439d11bb782c69ba2598b3ed0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:09 +0100 Subject: reiserfs: convert writepage to use a folio Convert the incoming page to a folio and then use it throughout the writeback path. This definitely isn't enough to support large folios, but I don't expect reiserfs to gain support for those before it is removed. Link: https://lkml.kernel.org/r/20231016201114.1928083-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/reiserfs/inode.c | 80 ++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 86e55d4bb10d..d9737235b8e0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2507,10 +2507,10 @@ out: * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ -static int reiserfs_write_full_page(struct page *page, +static int reiserfs_write_full_folio(struct folio *folio, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned long end_index = inode->i_size >> PAGE_SHIFT; int error = 0; unsigned long block; @@ -2518,7 +2518,7 @@ static int reiserfs_write_full_page(struct page *page, struct buffer_head *head, *bh; int partial = 0; int nr = 0; - int checked = PageChecked(page); + int checked = folio_test_checked(folio); struct reiserfs_transaction_handle th; struct super_block *s = inode->i_sb; int bh_per_page = PAGE_SIZE / s->s_blocksize; @@ -2526,47 +2526,46 @@ static int reiserfs_write_full_page(struct page *page, /* no logging allowed when nonblocking or from PF_MEMALLOC */ if (checked && (current->flags & PF_MEMALLOC)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return 0; } /* - * The page dirty bit is cleared before writepage is called, which + * The folio dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers - * The page really should be up to date at this point, so tossing + * The folio really should be up to date at this point, so tossing * in the BH_Uptodate is just a sanity check. */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, s->s_blocksize, + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, s->s_blocksize, (1 << BH_Dirty) | (1 << BH_Uptodate)); - } - head = page_buffers(page); /* - * last page in the file, zero out any contents past the + * last folio in the file, zero out any contents past the * last byte in the file */ - if (page->index >= end_index) { + if (folio->index >= end_index) { unsigned last_offset; last_offset = inode->i_size & (PAGE_SIZE - 1); - /* no file contents in this page */ - if (page->index >= end_index + 1 || !last_offset) { - unlock_page(page); + /* no file contents in this folio */ + if (folio->index >= end_index + 1 || !last_offset) { + folio_unlock(folio); return 0; } - zero_user_segment(page, last_offset, PAGE_SIZE); + folio_zero_segment(folio, last_offset, folio_size(folio)); } bh = head; - block = page->index << (PAGE_SHIFT - s->s_blocksize_bits); + block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits); last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; /* first map all the buffers, logging any direct items we find */ do { if (block > last_block) { /* * This can happen when the block size is less than - * the page size. The corresponding bytes in the page + * the folio size. The corresponding bytes in the folio * were zero filled above */ clear_buffer_dirty(bh); @@ -2593,7 +2592,7 @@ static int reiserfs_write_full_page(struct page *page, * blocks we're going to log */ if (checked) { - ClearPageChecked(page); + folio_clear_checked(folio); reiserfs_write_lock(s); error = journal_begin(&th, s, bh_per_page + 1); if (error) { @@ -2602,7 +2601,7 @@ static int reiserfs_write_full_page(struct page *page, } reiserfs_update_inode_transaction(inode); } - /* now go through and lock any dirty buffers on the page */ + /* now go through and lock any dirty buffers on the folio */ do { get_bh(bh); if (!buffer_mapped(bh)) @@ -2623,7 +2622,7 @@ static int reiserfs_write_full_page(struct page *page, lock_buffer(bh); } else { if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } } @@ -2640,13 +2639,13 @@ static int reiserfs_write_full_page(struct page *page, if (error) goto fail; } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); /* - * since any buffer might be the only dirty buffer on the page, - * the first submit_bh can bring the page out of writeback. + * since any buffer might be the only dirty buffer on the folio, + * the first submit_bh can bring the folio out of writeback. * be careful with the buffers. */ do { @@ -2663,10 +2662,10 @@ static int reiserfs_write_full_page(struct page *page, done: if (nr == 0) { /* - * if this page only had a direct item, it is very possible for + * if this folio only had a direct item, it is very possible for * no io to be required without there being an error. Or, * someone else could have locked them and sent them down the - * pipe without locking the page + * pipe without locking the folio */ bh = head; do { @@ -2677,18 +2676,18 @@ done: bh = bh->b_this_page; } while (bh != head); if (!partial) - SetPageUptodate(page); - end_page_writeback(page); + folio_mark_uptodate(folio); + folio_end_writeback(folio); } return error; fail: /* * catches various errors, we need to make sure any valid dirty blocks - * get to the media. The page is currently locked and not marked for + * get to the media. The folio is currently locked and not marked for * writeback */ - ClearPageUptodate(page); + folio_clear_uptodate(folio); bh = head; do { get_bh(bh); @@ -2698,16 +2697,16 @@ fail: } else { /* * clear any dirty bits that might have come from - * getting attached to a dirty page + * getting attached to a dirty folio */ clear_buffer_dirty(bh); } bh = bh->b_this_page; } while (bh != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -2728,9 +2727,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio) static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; reiserfs_wait_on_write_block(inode->i_sb); - return reiserfs_write_full_page(page, wbc); + return reiserfs_write_full_folio(folio, wbc); } static void reiserfs_truncate_failed_write(struct inode *inode) -- cgit v1.2.3 From 5fb7bd50b3516a9d5fea5775e7ebd43f0c96bb3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:10 +0100 Subject: ufs: add ufs_get_locked_folio and ufs_put_locked_folio Convert the _page variants to call them. Saves a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/util.c | 43 +++++++++++++++++++++++++------------------ fs/ufs/util.h | 13 +++++++++---- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 08ddf41eaaad..151b400cb3b6 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -229,43 +229,50 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); } +struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) +{ + struct folio *folio = ufs_get_locked_folio(mapping, index); + + if (folio) + return folio_file_page(folio, index); + return NULL; +} + /** - * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. * @mapping: the address_space to search * @index: the page index * - * Locates the desired pagecache page, if not exist we'll read it, + * Locates the desired pagecache folio, if not exist we'll read it, * locks it, increments its reference * count and returns its address. * */ - -struct page *ufs_get_locked_page(struct address_space *mapping, +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index) { struct inode *inode = mapping->host; - struct page *page = find_lock_page(mapping, index); - if (!page) { - page = read_mapping_page(mapping, index, NULL); + struct folio *folio = filemap_lock_folio(mapping, index); + if (!folio) { + folio = read_mapping_folio(mapping, index, NULL); - if (IS_ERR(page)) { - printk(KERN_ERR "ufs_change_blocknr: " - "read_mapping_page error: ino %lu, index: %lu\n", + if (IS_ERR(folio)) { + printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - return page; + return folio; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping == NULL)) { + if (unlikely(folio->mapping == NULL)) { /* Truncate got there first */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } } - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - return page; + if (!folio_buffers(folio)) + folio_create_empty_buffers(folio, 1 << inode->i_blkbits, 0); + return folio; } diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 89247193d96d..62542561d150 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -273,12 +273,17 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ -extern struct page *ufs_get_locked_page(struct address_space *mapping, - pgoff_t index); +struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index); +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); +static inline void ufs_put_locked_folio(struct folio *folio) +{ + folio_unlock(folio); + folio_put(folio); +} + static inline void ufs_put_locked_page(struct page *page) { - unlock_page(page); - put_page(page); + ufs_put_locked_folio(page_folio(page)); } -- cgit v1.2.3 From e7ca7f1725b3b0b276dd7c5119326b5e098abb53 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:11 +0100 Subject: ufs: use ufs_get_locked_folio() in ufs_alloc_lastblock() Switch to the folio APIs, saving one folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 21a4779a2de5..5b289374d4fd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1057,7 +1057,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned i, end; sector_t lastfrag; - struct page *lastpage; + struct folio *folio; struct buffer_head *bh; u64 phys64; @@ -1068,18 +1068,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) lastfrag--; - lastpage = ufs_get_locked_page(mapping, lastfrag >> + folio = ufs_get_locked_folio(mapping, lastfrag >> (PAGE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; + if (IS_ERR(folio)) { + err = -EIO; + goto out; + } + end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); + bh = folio_buffers(folio); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; err = ufs_getfrag_block(inode, lastfrag, bh, 1); @@ -1095,7 +1094,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) */ set_buffer_uptodate(bh); mark_buffer_dirty(bh); - set_page_dirty(lastpage); + folio_mark_dirty(folio); } if (lastfrag >= UFS_IND_FRAGMENT) { @@ -1113,7 +1112,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) } } out_unlock: - ufs_put_locked_page(lastpage); + ufs_put_locked_folio(folio); out: return err; } -- cgit v1.2.3 From c7e8812ce5cfcb16c69faa86d38b36ddd3141ba9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:12 +0100 Subject: ufs: convert ufs_change_blocknr() to use folios Convert the locked_page argument to a folio, then use folios throughout. Saves three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/balloc.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2436e3f82147..53c11be2b2c1 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, sector_t newb, struct page *locked_page) { + struct folio *folio, *locked_folio = page_folio(locked_page); const unsigned blks_per_page = 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; @@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, pgoff_t index, cur_index, last_index; unsigned pos, j, lblock; sector_t end, i; - struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); - BUG_ON(!locked_page); - BUG_ON(!PageLocked(locked_page)); + BUG_ON(!folio_test_locked(locked_folio)); - cur_index = locked_page->index; + cur_index = locked_folio->index; end = count + beg; last_index = end >> (PAGE_SHIFT - inode->i_blkbits); for (i = beg; i < end; i = (i | mask) + 1) { index = i >> (PAGE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { - page = ufs_get_locked_page(mapping, index); - if (!page)/* it was truncated */ + folio = ufs_get_locked_folio(mapping, index); + if (!folio) /* it was truncated */ continue; - if (IS_ERR(page)) {/* or EIO */ + if (IS_ERR(folio)) {/* or EIO */ ufs_error(inode->i_sb, __func__, "read of page %llu failed\n", (unsigned long long)index); continue; } } else - page = locked_page; + folio = locked_folio; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; pos = i & mask; for (j = 0; j < pos; ++j) bh = bh->b_this_page; - if (unlikely(index == last_index)) lblock = end & mask; else @@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, } while (bh != head); if (likely(cur_index != index)) - ufs_put_locked_page(page); + ufs_put_locked_folio(folio); } UFSD("EXIT\n"); } -- cgit v1.2.3 From c9f2480ed7b221baf738edf431757e5ca4115bca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:13 +0100 Subject: ufs: remove ufs_get_locked_page() Both callers are now converted to ufs_get_locked_folio(). Link: https://lkml.kernel.org/r/20231016201114.1928083-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/util.c | 9 --------- fs/ufs/util.h | 7 ------- 2 files changed, 16 deletions(-) diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 151b400cb3b6..d32de30009a0 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -229,15 +229,6 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); } -struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) -{ - struct folio *folio = ufs_get_locked_folio(mapping, index); - - if (folio) - return folio_file_page(folio, index); - return NULL; -} - /** * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 62542561d150..0ecd2ed792f5 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -273,7 +273,6 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ -struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index); struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); static inline void ufs_put_locked_folio(struct folio *folio) { @@ -281,12 +280,6 @@ static inline void ufs_put_locked_folio(struct folio *folio) folio_put(folio); } -static inline void ufs_put_locked_page(struct page *page) -{ - ufs_put_locked_folio(page_folio(page)); -} - - /* * macros and inline function to get important structures from ufs_sb_private_info */ -- cgit v1.2.3 From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:14 +0100 Subject: buffer: remove folio_create_empty_buffers() With all users converted, remove the old create_empty_buffers() and rename folio_create_empty_buffers() to create_empty_buffers(). Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 13 +++---------- fs/ext4/inode.c | 6 +++--- fs/ext4/move_extent.c | 4 ++-- fs/gfs2/aops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/gfs2/meta_io.c | 2 +- fs/gfs2/quota.c | 2 +- fs/mpage.c | 2 +- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/page.c | 4 ++-- fs/nilfs2/segment.c | 2 +- fs/ntfs/aops.c | 4 ++-- fs/ntfs/file.c | 2 +- fs/ntfs3/file.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/ufs/util.c | 2 +- include/linux/buffer_head.h | 4 +--- 18 files changed, 25 insertions(+), 34 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index bfa7604d1891..657a62bab73d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,7 +1641,7 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -struct buffer_head *folio_create_empty_buffers(struct folio *folio, +struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; @@ -1672,13 +1672,6 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio, return head; } -EXPORT_SYMBOL(folio_create_empty_buffers); - -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) -{ - folio_create_empty_buffers(page_folio(page), blocksize, b_state); -} EXPORT_SYMBOL(create_empty_buffers); /** @@ -1778,7 +1771,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, 1 << READ_ONCE(inode->i_blkbits), b_state); return bh; } @@ -2681,7 +2674,7 @@ int block_truncate_page(struct address_space *mapping, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, blocksize, 0); + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8e431ff2fd95..347fc8986e93 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1021,7 +1021,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1151,7 +1151,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -3642,7 +3642,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, blocksize, 0); + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7fe448fb948b..3aa57376d9c2 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -184,7 +184,7 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; @@ -380,7 +380,7 @@ data_copy: * but keeping in mind that i_size will not change */ bh = folio_buffers(folio[0]); if (!bh) - bh = folio_create_empty_buffers(folio[0], + bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index c26d48355cc2..6b060fc9e260 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio, if (folio_test_checked(folio)) { folio_clear_checked(folio); if (!folio_buffers(folio)) { - folio_create_empty_buffers(folio, + create_empty_buffers(folio, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 247d2c16593c..f1eee3f4704b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -71,7 +71,7 @@ static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, BIT(inode->i_blkbits), BIT(BH_Uptodate)); if (!buffer_mapped(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f6d40d51f5ed..25ceb0805df2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -134,7 +134,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) mapping_gfp_mask(mapping) | __GFP_NOFAIL); bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, sdp->sd_sb.sb_bsize, 0); } else { folio = __filemap_get_folio(mapping, index, diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 25354278cecb..2f1328af34f4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -763,7 +763,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, bsize, 0); + bh = create_empty_buffers(folio, bsize, 0); for (;;) { /* Find the beginning block within the folio */ diff --git a/fs/mpage.c b/fs/mpage.c index 964a6efe594d..ffb064ed9d04 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -119,7 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, folio_mark_uptodate(folio); return; } - head = folio_create_empty_buffers(folio, i_blocksize(inode), 0); + head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 7b754e6494d7..c97c77a39668 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -568,7 +568,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) bh_frozen = folio_buffers(folio); if (!bh_frozen) - bh_frozen = folio_create_empty_buffers(folio, 1 << blkbits, 0); + bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0); bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 696215d899bf..06b04758f289 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -34,7 +34,7 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, struct buffer_head *bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, 1 << blkbits, b_state); + bh = create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); @@ -204,7 +204,7 @@ static void nilfs_copy_folio(struct folio *dst, struct folio *src, sbh = folio_buffers(src); dbh = folio_buffers(dst); if (!dbh) - dbh = folio_create_empty_buffers(dst, sbh->b_size, 0); + dbh = create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 94388fe83cf8..55e31cc903d1 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -732,7 +732,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, } head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, + head = create_empty_buffers(folio, i_blocksize(inode), 0); folio_unlock(folio); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index c4426992a2ee..71e31e789b29 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -189,7 +189,7 @@ static int ntfs_read_block(struct folio *folio) head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; /* @@ -555,7 +555,7 @@ static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) head = folio_buffers(folio); if (!head) { BUG_ON(!folio_test_uptodate(folio)); - head = folio_create_empty_buffers(folio, blocksize, + head = create_empty_buffers(folio, blocksize, (1 << BH_Uptodate) | (1 << BH_Dirty)); } bh = head; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 099141d20db6..297c0b9db621 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -625,7 +625,7 @@ do_next_folio: * create_empty_buffers() will create uptodate/dirty * buffers if the folio is uptodate/dirty. */ - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; do { VCN cdelta; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a003a69091a2..66fd4ac28395 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -203,7 +203,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; bh_off = 0; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 95d1e70b4401..a6405dd5df09 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -601,7 +601,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, bsize, 0); + head = create_empty_buffers(folio, bsize, 0); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d9737235b8e0..a9075c4843ed 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2539,7 +2539,7 @@ static int reiserfs_write_full_folio(struct folio *folio, */ head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, s->s_blocksize, + head = create_empty_buffers(folio, s->s_blocksize, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* diff --git a/fs/ufs/util.c b/fs/ufs/util.c index d32de30009a0..13ba34e6d64f 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -264,6 +264,6 @@ struct folio *ufs_get_locked_folio(struct address_space *mapping, } } if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, 1 << inode->i_blkbits, 0); + create_empty_buffers(folio, 1 << inode->i_blkbits, 0); return folio; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3d85a0cf0ca5..5f23ee599889 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -201,9 +201,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); -void create_empty_buffers(struct page *, unsigned long, - unsigned long b_state); -struct buffer_head *folio_create_empty_buffers(struct folio *folio, +struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 09aec5f9b2505c73a9380800f61ad920cc64a7cd Mon Sep 17 00:00:00 2001 From: Pedro Falcato Date: Mon, 16 Oct 2023 16:34:46 +0100 Subject: mm: kmsan: panic on failure to allocate early boot metadata Given large enough allocations and a machine with low enough memory (i.e a default QEMU VM), it's entirely possible that kmsan_init_alloc_meta_for_range's shadow+origin allocation fails. Instead of eating a NULL deref kernel oops, check explicitly for memblock_alloc() failure and panic with a nice error message. Alexander Potapenko said: For posterity, it is generally quite important for the allocated shadow and origin to be contiguous, otherwise an unaligned memory write may result in memory corruption (the corresponding unaligned shadow write will be assuming that shadow pages are adjacent). So instead of panicking we could have split the range into smaller ones until the allocation succeeds, but that would've led to hard-to-debug problems in the future. Link: https://lkml.kernel.org/r/20231016153446.132763-1-pedro.falcato@gmail.com Signed-off-by: Pedro Falcato Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 87318f9170f1..b9d05aff313e 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -285,12 +285,17 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) size = PAGE_ALIGN((u64)end - (u64)start); shadow = memblock_alloc(size, PAGE_SIZE); origin = memblock_alloc(size, PAGE_SIZE); + + if (!shadow || !origin) + panic("%s: Failed to allocate metadata memory for early boot range of size %llu", + __func__, size); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); - shadow_p = virt_to_page_or_null((char *)shadow + addr); + shadow_p = virt_to_page((char *)shadow + addr); set_no_shadow_origin_page(shadow_p); shadow_page_for(page) = shadow_p; - origin_p = virt_to_page_or_null((char *)origin + addr); + origin_p = virt_to_page((char *)origin + addr); set_no_shadow_origin_page(origin_p); origin_page_for(page) = origin_p; } -- cgit v1.2.3 From 1f4f7f0f8845dbac40289cc3d50b81314c5a12b8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Mon, 16 Oct 2023 19:31:03 +0800 Subject: mm/oom_killer: simplify OOM killer info dump helper There is only one caller wants to dump the kill victim info, so just let it call the standalone helper, no need to make the generic info dump helper take an extra argument for that. Result of bloat-o-meter: ./scripts/bloat-o-meter ./mm/oom_kill.old.o ./mm/oom_kill.o add/remove: 0/0 grow/shrink: 1/2 up/down: 131/-142 (-11) Function old new delta oom_kill_process 412 543 +131 out_of_memory 1422 1418 -4 dump_header 562 424 -138 Total: Before=21514, After=21503, chg -0.05% Link: https://lkml.kernel.org/r/20231016113103.86477-1-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Cc: Christian Brauner Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/oom_kill.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 44bde56ecd02..9e6071fde34a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -437,7 +437,7 @@ static void dump_tasks(struct oom_control *oc) } } -static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) +static void dump_oom_victim(struct oom_control *oc, struct task_struct *victim) { /* one line summary of the oom killer context. */ pr_info("oom-kill:constraint=%s,nodemask=%*pbl", @@ -449,7 +449,7 @@ static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim) from_kuid(&init_user_ns, task_uid(victim))); } -static void dump_header(struct oom_control *oc, struct task_struct *p) +static void dump_header(struct oom_control *oc) { pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, @@ -467,8 +467,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) } if (sysctl_oom_dump_tasks) dump_tasks(oc); - if (p) - dump_oom_summary(oc, p); } /* @@ -1029,8 +1027,10 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } task_unlock(victim); - if (__ratelimit(&oom_rs)) - dump_header(oc, victim); + if (__ratelimit(&oom_rs)) { + dump_header(oc); + dump_oom_victim(oc, victim); + } /* * Do we need to kill the entire memory cgroup? @@ -1072,7 +1072,7 @@ static void check_panic_on_oom(struct oom_control *oc) /* Do not panic for oom kills triggered by sysrq */ if (is_sysrq_oom(oc)) return; - dump_header(oc, NULL); + dump_header(oc); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -1155,7 +1155,7 @@ bool out_of_memory(struct oom_control *oc) select_bad_process(oc); /* Found nothing?!?! */ if (!oc->chosen) { - dump_header(oc, NULL); + dump_header(oc); pr_warn("Out of memory and no killable processes...\n"); /* * If we got here due to an actual allocation at the -- cgit v1.2.3 From ca71fe1ad9221a89c6a25f49159c600d9e598ae1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:54 +0800 Subject: mm, pcp: avoid to drain PCP when process exit Patch series "mm: PCP high auto-tuning", v3. The page allocation performance requirements of different workloads are often different. So, we need to tune the PCP (Per-CPU Pageset) high on each CPU automatically to optimize the page allocation performance. The list of patches in series is as follows, [1/9] mm, pcp: avoid to drain PCP when process exit [2/9] cacheinfo: calculate per-CPU data cache size [3/9] mm, pcp: reduce lock contention for draining high-order pages [4/9] mm: restrict the pcp batch scale factor to avoid too long latency [5/9] mm, page_alloc: scale the number of pages that are batch allocated [6/9] mm: add framework for PCP high auto-tuning [7/9] mm: tune PCP high automatically [8/9] mm, pcp: decrease PCP high if free pages < high watermark [9/9] mm, pcp: reduce detecting time of consecutive high order page freeing Patch [1/9], [2/9], [3/9] optimize the PCP draining for consecutive high-order pages freeing. Patch [4/9], [5/9] optimize batch freeing and allocating. Patch [6/9], [7/9], [8/9] implement and optimize a PCP high auto-tuning method. Patch [9/9] optimize the PCP draining for consecutive high order page freeing based on PCP high auto-tuning. The test results for patches with performance impact are as follows, kbuild ====== On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. build time lock contend% free_high alloc_zone ---------- ---------- --------- ---------- base 100.0 14.0 100.0 100.0 patch1 99.5 12.8 19.5 95.6 patch3 99.4 12.6 7.1 95.6 patch5 98.6 11.0 8.1 97.1 patch7 95.1 0.5 2.8 15.6 patch9 95.0 1.0 8.8 20.0 The PCP draining optimization (patch [1/9], [3/9]) and PCP batch allocation optimization (patch [5/9]) reduces zone lock contention a little. The PCP high auto-tuning (patch [7/9], [9/9]) reduces build time visibly. Where the tuning target: the number of pages allocated from zone reduces greatly. So, the zone contention cycles% reduces greatly. With PCP tuning patches (patch [7/9], [9/9]), the average used memory during test increases up to 18.4% because more pages are cached in PCP. But at the end of the test, the number of the used memory decreases to the same level as that of the base patch. That is, the pages cached in PCP will be released to zone after not being used actively. netperf SCTP_STREAM_MANY ======================== On a 2-socket Intel server with 128 logical CPU, we tested SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes. score lock contend% free_high alloc_zone cache miss rate% ----- ---------- --------- ---------- ---------------- base 100.0 2.1 100.0 100.0 1.3 patch1 99.4 2.1 99.4 99.4 1.3 patch3 106.4 1.3 13.3 106.3 1.3 patch5 106.0 1.2 13.2 105.9 1.3 patch7 103.4 1.9 6.7 90.3 7.6 patch9 108.6 1.3 13.7 108.6 1.3 The PCP draining optimization (patch [1/9]+[3/9]) improves performance. The PCP high auto-tuning (patch [7/9]) reduces performance a little because PCP draining cannot be triggered in time sometimes. So, the cache miss rate% increases. The further PCP draining optimization (patch [9/9]) based on PCP tuning restore the performance. lmbench3 UNIX (AF_UNIX) ======================= On a 2-socket Intel server with 128 logical CPU, we tested UNIX (AF_UNIX socket) test case of lmbench3 test suite with 16-pair processes. score lock contend% free_high alloc_zone cache miss rate% ----- ---------- --------- ---------- ---------------- base 100.0 51.4 100.0 100.0 0.2 patch1 116.8 46.1 69.5 104.3 0.2 patch3 199.1 21.3 7.0 104.9 0.2 patch5 200.0 20.8 7.1 106.9 0.3 patch7 191.6 19.9 6.8 103.8 2.8 patch9 193.4 21.7 7.0 104.7 2.1 The PCP draining optimization (patch [1/9], [3/9]) improves performance much. The PCP tuning (patch [7/9]) reduces performance a little because PCP draining cannot be triggered in time sometimes. The further PCP draining optimization (patch [9/9]) based on PCP tuning restores the performance partly. The patchset adds several fields in struct per_cpu_pages. The struct layout before/after the patchset is as follows, base ==== struct per_cpu_pages { spinlock_t lock; /* 0 4 */ int count; /* 4 4 */ int high; /* 8 4 */ int batch; /* 12 4 */ short int free_factor; /* 16 2 */ short int expire; /* 18 2 */ /* XXX 4 bytes hole, try to pack */ struct list_head lists[13]; /* 24 208 */ /* size: 256, cachelines: 4, members: 7 */ /* sum members: 228, holes: 1, sum holes: 4 */ /* padding: 24 */ } __attribute__((__aligned__(64))); patched ======= struct per_cpu_pages { spinlock_t lock; /* 0 4 */ int count; /* 4 4 */ int high; /* 8 4 */ int high_min; /* 12 4 */ int high_max; /* 16 4 */ int batch; /* 20 4 */ u8 flags; /* 24 1 */ u8 alloc_factor; /* 25 1 */ u8 expire; /* 26 1 */ /* XXX 1 byte hole, try to pack */ short int free_count; /* 28 2 */ /* XXX 2 bytes hole, try to pack */ struct list_head lists[13]; /* 32 208 */ /* size: 256, cachelines: 4, members: 11 */ /* sum members: 237, holes: 2, sum holes: 3 */ /* padding: 16 */ } __attribute__((__aligned__(64))); The size of the struct doesn't changed with the patchset. This patch (of 9): In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when PCP is mostly used for high-order pages freeing to improve the cache-hot pages reusing between page allocation and freeing CPUs. But, the PCP draining mechanism may be triggered unexpectedly when process exits. With some customized trace point, it was found that PCP draining (free_high == true) was triggered with the order-1 page freeing with the following call stack, => free_unref_page_commit => free_unref_page => __mmdrop => exit_mm => do_exit => do_group_exit => __x64_sys_exit_group => do_syscall_64 Checking the source code, this is the page table PGD freeing (mm_free_pgd()). It's a order-1 page freeing if CONFIG_PAGE_TABLE_ISOLATION=y. Which is a common configuration for security. Just before that, page freeing with the following call stack was found, => free_unref_page_commit => free_unref_page_list => release_pages => tlb_batch_pages_flush => tlb_finish_mmu => exit_mmap => __mmput => exit_mm => do_exit => do_group_exit => __x64_sys_exit_group => do_syscall_64 So, when a process exits, - a large number of user pages of the process will be freed without page allocation, it's highly possible that pcp->free_factor becomes > 0. In fact, this is expected behavior to improve process exit performance. - after freeing all user pages, the PGD will be freed, which is a order-1 page freeing, PCP will be drained. All in all, when a process exits, it's high possible that the PCP will be drained. This is an unexpected behavior. To avoid this, in the patch, the PCP draining will only be triggered for 2 consecutive high-order page freeing. On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. With the patch, the cycles% of the spinlock contention (mostly for zone lock) decreases from 14.0% to 12.8% (with PCP size == 367). The number of PCP draining for high order pages freeing (free_high) decreases 80.5%. This helps network workload too for reduced zone lock contention. On a 2-socket Intel server with 128 logical CPU, with the patch, the network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with 16-pair processes increase 16.8%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 51.4% to 46.1%. The number of PCP draining for high order pages freeing (free_high) decreases 30.5%. The cache miss rate keeps 0.2%. Link: https://lkml.kernel.org/r/20231016053002.756205-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20231016053002.756205-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 12 +++++++++++- mm/page_alloc.c | 11 ++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 486587fcd27f..de313f1c15f9 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -674,12 +674,22 @@ enum zone_watermarks { #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) +/* + * Flags used in pcp->flags field. + * + * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the + * previous page freeing. To avoid to drain PCP for an accident + * high-order page freeing. + */ +#define PCPF_PREV_FREE_HIGH_ORDER BIT(0) + struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ - short free_factor; /* batch scaling factor during free */ + u8 flags; /* protected by pcp->lock */ + u8 free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA short expire; /* When 0, remote pagesets are drained */ #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f46e519618a0..de547ef9a9ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2370,7 +2370,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, { int high; int pindex; - bool free_high; + bool free_high = false; __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); @@ -2383,8 +2383,13 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, * freeing without allocation. The remainder after bulk freeing * stops will be drained from vmstat refresh context. */ - free_high = (pcp->free_factor && order && order <= PAGE_ALLOC_COSTLY_ORDER); - + if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { + free_high = (pcp->free_factor && + (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER)); + pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; + } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { + pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; + } high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); -- cgit v1.2.3 From 94a3bfe4073cd88b05f7fb201ea7bf9dfa2cf5d5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:55 +0800 Subject: cacheinfo: calculate size of per-CPU data cache slice This can be used to estimate the size of the data cache slice that can be used by one CPU under ideal circumstances. Both DATA caches and UNIFIED caches are used in calculation. So, the users need to consider the impact of the code cache usage. Because the cache inclusive/non-inclusive information isn't available now, we just use the size of the per-CPU slice of LLC to make the result more predictable across architectures. This may be improved when more cache information is available in the future. A brute-force algorithm to iterate all online CPUs is used to avoid to allocate an extra cpumask, especially in offline callback. Link: https://lkml.kernel.org/r/20231016053002.756205-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Sudeep Holla Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Signed-off-by: Andrew Morton --- drivers/base/cacheinfo.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++- include/linux/cacheinfo.h | 1 + 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index cbae8be1fe52..585c66fce9d9 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -898,6 +898,48 @@ err: return rc; } +/* + * Calculate the size of the per-CPU data cache slice. This can be + * used to estimate the size of the data cache slice that can be used + * by one CPU under ideal circumstances. UNIFIED caches are counted + * in addition to DATA caches. So, please consider code cache usage + * when use the result. + * + * Because the cache inclusive/non-inclusive information isn't + * available, we just use the size of the per-CPU slice of LLC to make + * the result more predictable across architectures. + */ +static void update_per_cpu_data_slice_size_cpu(unsigned int cpu) +{ + struct cpu_cacheinfo *ci; + struct cacheinfo *llc; + unsigned int nr_shared; + + if (!last_level_cache_is_valid(cpu)) + return; + + ci = ci_cacheinfo(cpu); + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); + + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) + return; + + nr_shared = cpumask_weight(&llc->shared_cpu_map); + if (nr_shared) + ci->per_cpu_data_slice_size = llc->size / nr_shared; +} + +static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu) +{ + unsigned int icpu; + + for_each_online_cpu(icpu) { + if (!cpu_online && icpu == cpu) + continue; + update_per_cpu_data_slice_size_cpu(icpu); + } +} + static int cacheinfo_cpu_online(unsigned int cpu) { int rc = detect_cache_attributes(cpu); @@ -906,7 +948,11 @@ static int cacheinfo_cpu_online(unsigned int cpu) return rc; rc = cache_add_dev(cpu); if (rc) - free_cache_attributes(cpu); + goto err; + update_per_cpu_data_slice_size(true, cpu); + return 0; +err: + free_cache_attributes(cpu); return rc; } @@ -916,6 +962,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu) cpu_cache_sysfs_exit(cpu); free_cache_attributes(cpu); + update_per_cpu_data_slice_size(false, cpu); return 0; } diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h index a5cfd44fab45..d504eb4b49ab 100644 --- a/include/linux/cacheinfo.h +++ b/include/linux/cacheinfo.h @@ -73,6 +73,7 @@ struct cacheinfo { struct cpu_cacheinfo { struct cacheinfo *info_list; + unsigned int per_cpu_data_slice_size; unsigned int num_levels; unsigned int num_leaves; bool cpu_map_populated; -- cgit v1.2.3 From 362d37a106dd3f6431b2fdd91d9208b0d023b50d Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:56 +0800 Subject: mm, pcp: reduce lock contention for draining high-order pages In commit f26b3fa04611 ("mm/page_alloc: limit number of high-order pages on PCP during bulk free"), the PCP (Per-CPU Pageset) will be drained when PCP is mostly used for high-order pages freeing to improve the cache-hot pages reusing between page allocating and freeing CPUs. On system with small per-CPU data cache slice, pages shouldn't be cached before draining to guarantee cache-hot. But on a system with large per-CPU data cache slice, some pages can be cached before draining to reduce zone lock contention. So, in this patch, instead of draining without any caching, "pcp->batch" pages will be cached in PCP before draining if the size of the per-CPU data cache slice is more than "3 * batch". In theory, if the size of per-CPU data cache slice is more than "2 * batch", we can reuse cache-hot pages between CPUs. But considering the other usage of cache (code, other data accessing, etc.), "3 * batch" is used. Note: "3 * batch" is chosen to make sure the optimization works on recent x86_64 server CPUs. If you want to increase it, please check whether it breaks the optimization. On a 2-socket Intel server with 128 logical CPU, with the patch, the network bandwidth of the UNIX (AF_UNIX) test case of lmbench test suite with 16-pair processes increase 70.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 46.1% to 21.3%. The number of PCP draining for high order pages freeing (free_high) decreases 89.9%. The cache miss rate keeps 0.2%. Link: https://lkml.kernel.org/r/20231016053002.756205-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Sudeep Holla Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Signed-off-by: Andrew Morton --- drivers/base/cacheinfo.c | 2 ++ include/linux/gfp.h | 1 + include/linux/mmzone.h | 6 ++++++ mm/page_alloc.c | 38 +++++++++++++++++++++++++++++++++++++- 4 files changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/base/cacheinfo.c b/drivers/base/cacheinfo.c index 585c66fce9d9..f1e79263fe61 100644 --- a/drivers/base/cacheinfo.c +++ b/drivers/base/cacheinfo.c @@ -950,6 +950,7 @@ static int cacheinfo_cpu_online(unsigned int cpu) if (rc) goto err; update_per_cpu_data_slice_size(true, cpu); + setup_pcp_cacheinfo(); return 0; err: free_cache_attributes(cpu); @@ -963,6 +964,7 @@ static int cacheinfo_cpu_pre_down(unsigned int cpu) free_cache_attributes(cpu); update_per_cpu_data_slice_size(false, cpu); + setup_pcp_cacheinfo(); return 0; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 665f06675c83..665edc11fb9f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -325,6 +325,7 @@ void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); +void setup_pcp_cacheinfo(void); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index de313f1c15f9..efe72b3f7872 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -680,8 +680,14 @@ enum zone_watermarks { * PCPF_PREV_FREE_HIGH_ORDER: a high-order page is freed in the * previous page freeing. To avoid to drain PCP for an accident * high-order page freeing. + * + * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in PCP before + * draining PCP for consecutive high-order pages freeing without + * allocation if data cache slice of CPU is large enough. To reduce + * zone lock contention and keep cache-hot pages reusing. */ #define PCPF_PREV_FREE_HIGH_ORDER BIT(0) +#define PCPF_FREE_HIGH_BATCH BIT(1) struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index de547ef9a9ad..b76b1de48a30 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include "internal.h" #include "shuffle.h" @@ -2385,7 +2386,9 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { free_high = (pcp->free_factor && - (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER)); + (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && + (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || + pcp->count >= READ_ONCE(pcp->batch))); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; @@ -5418,6 +5421,39 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } +static void zone_pcp_update_cacheinfo(struct zone *zone) +{ + int cpu; + struct per_cpu_pages *pcp; + struct cpu_cacheinfo *cci; + + for_each_online_cpu(cpu) { + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. + */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); + } +} + +void setup_pcp_cacheinfo(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zone_pcp_update_cacheinfo(zone); +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. -- cgit v1.2.3 From 52166607ecc980391b1fffbce0be3074a96d0c7b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:57 +0800 Subject: mm: restrict the pcp batch scale factor to avoid too long latency In page allocator, PCP (Per-CPU Pageset) is refilled and drained in batches to increase page allocation throughput, reduce page allocation/freeing latency per page, and reduce zone lock contention. But too large batch size will cause too long maximal allocation/freeing latency, which may punish arbitrary users. So the default batch size is chosen carefully (in zone_batchsize(), the value is 63 for zone > 1GB) to avoid that. In commit 3b12e7e97938 ("mm/page_alloc: scale the number of pages that are batch freed"), the batch size will be scaled for large number of page freeing to improve page freeing performance and reduce zone lock contention. Similar optimization can be used for large number of pages allocation too. To find out a suitable max batch scale factor (that is, max effective batch size), some tests and measurement on some machines were done as follows. A set of debug patches are implemented as follows, - Set PCP high to be 2 * batch to reduce the effect of PCP high - Disable free batch size scaling to get the raw performance. - The code with zone lock held is extracted from rmqueue_bulk() and free_pcppages_bulk() to 2 separate functions to make it easy to measure the function run time with ftrace function_graph tracer. - The batch size is hard coded to be 63 (default), 127, 255, 511, 1023, 2047, 4095. Then will-it-scale/page_fault1 is used to generate the page allocation/freeing workload. The page allocation/freeing throughput (page/s) is measured via will-it-scale. The page allocation/freeing average latency (alloc/free latency avg, in us) and allocation/freeing latency at 99 percentile (alloc/free latency 99%, in us) are measured with ftrace function_graph tracer. The test results are as follows, Sapphire Rapids Server ====================== Batch throughput free latency free latency alloc latency alloc latency page/s avg / us 99% / us avg / us 99% / us ----- ---------- ------------ ------------ ------------- ------------- 63 513633.4 2.33 3.57 2.67 6.83 127 517616.7 4.35 6.65 4.22 13.03 255 520822.8 8.29 13.32 7.52 25.24 511 524122.0 15.79 23.42 14.02 49.35 1023 525980.5 30.25 44.19 25.36 94.88 2047 526793.6 59.39 84.50 45.22 140.81 Ice Lake Server =============== Batch throughput free latency free latency alloc latency alloc latency page/s avg / us 99% / us avg / us 99% / us ----- ---------- ------------ ------------ ------------- ------------- 63 620210.3 2.21 3.68 2.02 4.35 127 627003.0 4.09 6.86 3.51 8.28 255 630777.5 7.70 13.50 6.17 15.97 511 633651.5 14.85 22.62 11.66 31.08 1023 637071.1 28.55 42.02 20.81 54.36 2047 638089.7 56.54 84.06 39.28 91.68 Cascade Lake Server =================== Batch throughput free latency free latency alloc latency alloc latency page/s avg / us 99% / us avg / us 99% / us ----- ---------- ------------ ------------ ------------- ------------- 63 404706.7 3.29 5.03 3.53 4.75 127 422475.2 6.12 9.09 6.36 8.76 255 411522.2 11.68 16.97 10.90 16.39 511 428124.1 22.54 31.28 19.86 32.25 1023 414718.4 43.39 62.52 40.00 66.33 2047 429848.7 86.64 120.34 71.14 106.08 Commet Lake Desktop =================== Batch throughput free latency free latency alloc latency alloc latency page/s avg / us 99% / us avg / us 99% / us ----- ---------- ------------ ------------ ------------- ------------- 63 795183.13 2.18 3.55 2.03 3.05 127 803067.85 3.91 6.56 3.85 5.52 255 812771.10 7.35 10.80 7.14 10.20 511 817723.48 14.17 27.54 13.43 30.31 1023 818870.19 27.72 40.10 27.89 46.28 Coffee Lake Desktop =================== Batch throughput free latency free latency alloc latency alloc latency page/s avg / us 99% / us avg / us 99% / us ----- ---------- ------------ ------------ ------------- ------------- 63 510542.8 3.13 4.40 2.48 3.43 127 514288.6 5.97 7.89 4.65 6.04 255 516889.7 11.86 15.58 8.96 12.55 511 519802.4 23.10 28.81 16.95 26.19 1023 520802.7 45.30 52.51 33.19 45.95 2047 519997.1 90.63 104.00 65.26 81.74 From the above data, to restrict the allocation/freeing latency to be less than 100 us in most times, the max batch scale factor needs to be less than or equal to 5. Although it is reasonable to use 5 as max batch scale factor for the systems tested, there are also slower systems. Where smaller value should be used to constrain the page allocation/freeing latency. So, in this patch, a new kconfig option (PCP_BATCH_SCALE_MAX) is added to set the max batch scale factor. Whose default value is 5, and users can reduce it when necessary. Link: https://lkml.kernel.org/r/20231016053002.756205-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Andrew Morton Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- mm/Kconfig | 11 +++++++++++ mm/page_alloc.c | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 45f2971ef642..89971a894b60 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -705,6 +705,17 @@ config HUGETLB_PAGE_SIZE_VARIABLE config CONTIG_ALLOC def_bool (MEMORY_ISOLATION && COMPACTION) || CMA +config PCP_BATCH_SCALE_MAX + int "Maximum scale factor of PCP (Per-CPU pageset) batch allocate/free" + default 5 + range 0 6 + help + In page allocator, PCP (Per-CPU pageset) is refilled and drained in + batches. The batch number is scaled automatically to improve page + allocation/free throughput. But too large scale factor may hurt + latency. This option sets the upper limit of scale factor to limit + the maximum latency. + config PHYS_ADDR_T_64BIT def_bool 64BIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b76b1de48a30..05a8d34827d2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2340,7 +2340,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) * freeing of pages without any allocation. */ batch <<= pcp->free_factor; - if (batch < max_nr_free) + if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->free_factor++; batch = clamp(batch, min_nr_free, max_nr_free); -- cgit v1.2.3 From c0a242394cb980bd00e1f61dc8aacb453d2bbe6a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:58 +0800 Subject: mm, page_alloc: scale the number of pages that are batch allocated When a task is allocating a large number of order-0 pages, it may acquire the zone->lock multiple times allocating pages in batches. This may unnecessarily contend on the zone lock when allocating very large number of pages. This patch adapts the size of the batch based on the recent pattern to scale the batch size for subsequent allocations. On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. With the patch, the cycles% of the spinlock contention (mostly for zone lock) decreases from 12.6% to 11.0% (with PCP size == 367). Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mel Gorman Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 53 ++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index efe72b3f7872..e6cc2238c65f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -695,9 +695,10 @@ struct per_cpu_pages { int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ + u8 alloc_factor; /* batch scaling factor during allocate */ u8 free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA - short expire; /* When 0, remote pagesets are drained */ + u8 expire; /* When 0, remote pagesets are drained */ #endif /* Lists of pages, one per migrate type stored on the pcp-lists */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05a8d34827d2..6f6af835b05b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,6 +2373,12 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high = false; + /* + * On freeing, reduce the number of pages that are batch allocated. + * See nr_pcp_alloc() where alloc_factor is increased for subsequent + * allocations. + */ + pcp->alloc_factor >>= 1; __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); @@ -2679,6 +2685,42 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return page; } +static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) +{ + int high, batch, max_nr_alloc; + + high = READ_ONCE(pcp->high); + batch = READ_ONCE(pcp->batch); + + /* Check for PCP disabled or boot pageset */ + if (unlikely(high < batch)) + return 1; + + /* + * Double the number of pages allocated each time there is subsequent + * allocation of order-0 pages without any freeing. + */ + if (!order) { + max_nr_alloc = max(high - pcp->count - batch, batch); + batch <<= pcp->alloc_factor; + if (batch <= max_nr_alloc && + pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) + pcp->alloc_factor++; + batch = min(batch, max_nr_alloc); + } + + /* + * Scale batch relative to order if batch implies free pages + * can be stored on the PCP. Batch can be 1 for small zones or + * for boot pagesets which should never store free pages as + * the pages may belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + + return batch; +} + /* Remove page from the per-cpu list, caller must protect the list */ static inline struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, @@ -2691,18 +2733,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, do { if (list_empty(list)) { - int batch = READ_ONCE(pcp->batch); + int batch = nr_pcp_alloc(pcp, order); int alloced; - /* - * Scale batch relative to order if batch implies - * free pages can be stored on the PCP. Batch can - * be 1 for small zones or for boot pagesets which - * should never store free pages as the pages may - * belong to arbitrary zones. - */ - if (batch > 1) - batch = max(batch >> order, 2); alloced = rmqueue_bulk(zone, order, batch, list, migratetype, alloc_flags); -- cgit v1.2.3 From 90b41691b9881376fe784e13b5766ec3676fdb55 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:29:59 +0800 Subject: mm: add framework for PCP high auto-tuning The page allocation performance requirements of different workloads are usually different. So, we need to tune PCP (per-CPU pageset) high to optimize the workload page allocation performance. Now, we have a system wide sysctl knob (percpu_pagelist_high_fraction) to tune PCP high by hand. But, it's hard to find out the best value by hand. And one global configuration may not work best for the different workloads that run on the same system. One solution to these issues is to tune PCP high of each CPU automatically. This patch adds the framework for PCP high auto-tuning. With it, pcp->high of each CPU will be changed automatically by tuning algorithm at runtime. The minimal high (pcp->high_min) is the original PCP high value calculated based on the low watermark pages. While the maximal high (pcp->high_max) is the PCP high value when percpu_pagelist_high_fraction sysctl knob is set to MIN_PERCPU_PAGELIST_HIGH_FRACTION. That is, the maximal pcp->high that can be set via sysctl knob by hand. It's possible that PCP high auto-tuning doesn't work well for some workloads. So, when PCP high is tuned by hand via the sysctl knob, the auto-tuning will be disabled. The PCP high set by hand will be used instead. This patch only adds the framework, so pcp->high will be set to pcp->high_min (original default) always. We will add actual auto-tuning algorithm in the following patches in the series. Link: https://lkml.kernel.org/r/20231016053002.756205-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 +++- mm/page_alloc.c | 71 ++++++++++++++++++++++++++++++++------------------ 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e6cc2238c65f..775abc899e80 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -693,6 +693,8 @@ struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ + int high_min; /* min high watermark */ + int high_max; /* max high watermark */ int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ @@ -852,7 +854,8 @@ struct zone { * the high and batch values are copied to individual pagesets for * faster access */ - int pageset_high; + int pageset_high_min; + int pageset_high_max; int pageset_batch; #ifndef CONFIG_SPARSEMEM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f6af835b05b..922836b6a01d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2350,7 +2350,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, bool free_high) { - int high = READ_ONCE(pcp->high); + int high = READ_ONCE(pcp->high_min); if (unlikely(!high || free_high)) return 0; @@ -2689,7 +2689,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) { int high, batch, max_nr_alloc; - high = READ_ONCE(pcp->high); + high = READ_ONCE(pcp->high_min); batch = READ_ONCE(pcp->batch); /* Check for PCP disabled or boot pageset */ @@ -5296,14 +5296,15 @@ static int zone_batchsize(struct zone *zone) } static int percpu_pagelist_high_fraction; -static int zone_highsize(struct zone *zone, int batch, int cpu_online) +static int zone_highsize(struct zone *zone, int batch, int cpu_online, + int high_fraction) { #ifdef CONFIG_MMU int high; int nr_split_cpus; unsigned long total_pages; - if (!percpu_pagelist_high_fraction) { + if (!high_fraction) { /* * By default, the high value of the pcp is based on the zone * low watermark so that if they are full then background @@ -5316,15 +5317,15 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * value is based on a fraction of the managed pages in the * zone. */ - total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction; + total_pages = zone_managed_pages(zone) / high_fraction; } /* * Split the high value across all online CPUs local to the zone. Note * that early in boot that CPUs may not be online yet and that during * CPU hotplug that the cpumask is not yet updated when a CPU is being - * onlined. For memory nodes that have no CPUs, split pcp->high across - * all online CPUs to mitigate the risk that reclaim is triggered + * onlined. For memory nodes that have no CPUs, split the high value + * across all online CPUs to mitigate the risk that reclaim is triggered * prematurely due to pages stored on pcp lists. */ nr_split_cpus = cpumask_weight(cpumask_of_node(zone_to_nid(zone))) + cpu_online; @@ -5352,19 +5353,21 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online) * However, guaranteeing these relations at all times would require e.g. write * barriers here but also careful usage of read barriers at the read side, and * thus be prone to error and bad for performance. Thus the update only prevents - * store tearing. Any new users of pcp->batch and pcp->high should ensure they - * can cope with those fields changing asynchronously, and fully trust only the - * pcp->count field on the local CPU with interrupts disabled. + * store tearing. Any new users of pcp->batch, pcp->high_min and pcp->high_max + * should ensure they can cope with those fields changing asynchronously, and + * fully trust only the pcp->count field on the local CPU with interrupts + * disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters * exist). */ -static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, - unsigned long batch) +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high_min, + unsigned long high_max, unsigned long batch) { WRITE_ONCE(pcp->batch, batch); - WRITE_ONCE(pcp->high, high); + WRITE_ONCE(pcp->high_min, high_min); + WRITE_ONCE(pcp->high_max, high_max); } static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats) @@ -5384,20 +5387,21 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta * need to be as careful as pageset_update() as nobody can access the * pageset yet. */ - pcp->high = BOOT_PAGESET_HIGH; + pcp->high_min = BOOT_PAGESET_HIGH; + pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; pcp->free_factor = 0; } -static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, - unsigned long batch) +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, + unsigned long high_max, unsigned long batch) { struct per_cpu_pages *pcp; int cpu; for_each_possible_cpu(cpu) { pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - pageset_update(pcp, high, batch); + pageset_update(pcp, high_min, high_max, batch); } } @@ -5407,19 +5411,34 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h */ static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online) { - int new_high, new_batch; + int new_high_min, new_high_max, new_batch; new_batch = max(1, zone_batchsize(zone)); - new_high = zone_highsize(zone, new_batch, cpu_online); + if (percpu_pagelist_high_fraction) { + new_high_min = zone_highsize(zone, new_batch, cpu_online, + percpu_pagelist_high_fraction); + /* + * PCP high is tuned manually, disable auto-tuning via + * setting high_min and high_max to the manual value. + */ + new_high_max = new_high_min; + } else { + new_high_min = zone_highsize(zone, new_batch, cpu_online, 0); + new_high_max = zone_highsize(zone, new_batch, cpu_online, + MIN_PERCPU_PAGELIST_HIGH_FRACTION); + } - if (zone->pageset_high == new_high && + if (zone->pageset_high_min == new_high_min && + zone->pageset_high_max == new_high_max && zone->pageset_batch == new_batch) return; - zone->pageset_high = new_high; + zone->pageset_high_min = new_high_min; + zone->pageset_high_max = new_high_max; zone->pageset_batch = new_batch; - __zone_set_pageset_high_and_batch(zone, new_high, new_batch); + __zone_set_pageset_high_and_batch(zone, new_high_min, new_high_max, + new_batch); } void __meminit setup_zone_pageset(struct zone *zone) @@ -5528,7 +5547,8 @@ __meminit void zone_pcp_init(struct zone *zone) */ zone->per_cpu_pageset = &boot_pageset; zone->per_cpu_zonestats = &boot_zonestats; - zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_high_min = BOOT_PAGESET_HIGH; + zone->pageset_high_max = BOOT_PAGESET_HIGH; zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) @@ -6430,13 +6450,14 @@ EXPORT_SYMBOL(free_contig_range); void zone_pcp_disable(struct zone *zone) { mutex_lock(&pcp_batch_high_lock); - __zone_set_pageset_high_and_batch(zone, 0, 1); + __zone_set_pageset_high_and_batch(zone, 0, 0, 1); __drain_all_pages(zone, true); } void zone_pcp_enable(struct zone *zone) { - __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); + __zone_set_pageset_high_and_batch(zone, zone->pageset_high_min, + zone->pageset_high_max, zone->pageset_batch); mutex_unlock(&pcp_batch_high_lock); } -- cgit v1.2.3 From 51a755c56dc05a8b31ed28d24f28354946dc7529 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:00 +0800 Subject: mm: tune PCP high automatically The target to tune PCP high automatically is as follows, - Minimize allocation/freeing from/to shared zone - Minimize idle pages in PCP - Minimize pages in PCP if the system free pages is too few To reach these target, a tuning algorithm as follows is designed, - When we refill PCP via allocating from the zone, increase PCP high. Because if we had larger PCP, we could avoid to allocate from the zone. - In periodic vmstat updating kworker (via refresh_cpu_vm_stats()), decrease PCP high to try to free possible idle PCP pages. - When page reclaiming is active for the zone, stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. So, the PCP high can be tuned to the page allocating/freeing depth of workloads eventually. One issue of the algorithm is that if the number of pages allocated is much more than that of pages freed on a CPU, the PCP high may become the maximal value even if the allocating/freeing depth is small. But this isn't a severe issue, because there are no idle pages in this case. One alternative choice is to increase PCP high when we drain PCP via trying to free pages to the zone, but don't increase PCP high during PCP refilling. This can avoid the issue above. But if the number of pages allocated is much less than that of pages freed on a CPU, there will be many idle pages in PCP and it is hard to free these idle pages. 1/8 (>> 3) of PCP high will be decreased periodically. The value 1/8 is kind of arbitrary. Just to make sure that the idle PCP pages will be freed eventually. On a 2-socket Intel server with 224 logical CPU, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroup. This simulates the kbuild server that is used by 0-Day kbuild service. With the patch, the build time decreases 3.5%. The cycles% of the spinlock contention (mostly for zone lock) decreases from 11.0% to 0.5%. The number of PCP draining for high order pages freeing (free_high) decreases 65.6%. The number of pages allocated from zone (instead of from PCP) decreases 83.9%. Link: https://lkml.kernel.org/r/20231016053002.756205-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Mel Gorman Suggested-by: Michal Hocko Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/gfp.h | 1 + mm/page_alloc.c | 119 +++++++++++++++++++++++++++++++++++++++++----------- mm/vmstat.c | 8 ++-- 3 files changed, 99 insertions(+), 29 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 665edc11fb9f..5b917e5b9350 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -320,6 +320,7 @@ extern void page_frag_free(void *addr); #define free_page(addr) free_pages((addr), 0) void page_alloc_init_cpuhp(void); +int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 922836b6a01d..83af76a8cef9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2157,6 +2157,40 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return i; } +/* + * Called from the vmstat counter updater to decay the PCP high. + * Return whether there are addition works to do. + */ +int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp) +{ + int high_min, to_drain, batch; + int todo = 0; + + high_min = READ_ONCE(pcp->high_min); + batch = READ_ONCE(pcp->batch); + /* + * Decrease pcp->high periodically to try to free possible + * idle PCP pages. And, avoid to free too many pages to + * control latency. This caps pcp->high decrement too. + */ + if (pcp->high > high_min) { + pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX), + pcp->high - (pcp->high >> 3), high_min); + if (pcp->high > high_min) + todo++; + } + + to_drain = pcp->count - pcp->high; + if (to_drain > 0) { + spin_lock(&pcp->lock); + free_pcppages_bulk(zone, to_drain, pcp, 0); + spin_unlock(&pcp->lock); + todo++; + } + + return todo; +} + #ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this @@ -2318,14 +2352,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) +static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free_high) { int min_nr_free, max_nr_free; - int batch = READ_ONCE(pcp->batch); - /* Free everything if batch freeing high-order pages. */ + /* Free as much as possible if batch freeing high-order pages. */ if (unlikely(free_high)) - return pcp->count; + return min(pcp->count, batch << CONFIG_PCP_BATCH_SCALE_MAX); /* Check for PCP disabled or boot pageset */ if (unlikely(high < batch)) @@ -2340,7 +2373,7 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) * freeing of pages without any allocation. */ batch <<= pcp->free_factor; - if (batch < max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) + if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->free_factor++; batch = clamp(batch, min_nr_free, max_nr_free); @@ -2348,28 +2381,48 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) } static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, - bool free_high) + int batch, bool free_high) { - int high = READ_ONCE(pcp->high_min); + int high, high_min, high_max; + + high_min = READ_ONCE(pcp->high_min); + high_max = READ_ONCE(pcp->high_max); + high = pcp->high = clamp(pcp->high, high_min, high_max); - if (unlikely(!high || free_high)) + if (unlikely(!high)) return 0; - if (!test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) - return high; + if (unlikely(free_high)) { + pcp->high = max(high - (batch << CONFIG_PCP_BATCH_SCALE_MAX), + high_min); + return 0; + } /* * If reclaim is active, limit the number of pages that can be * stored on pcp lists */ - return min(READ_ONCE(pcp->batch) << 2, high); + if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { + pcp->high = max(high - (batch << pcp->free_factor), high_min); + return min(batch << 2, pcp->high); + } + + if (pcp->count >= high && high_min != high_max) { + int need_high = (batch << pcp->free_factor) + batch; + + /* pcp->high should be large enough to hold batch freed pages */ + if (pcp->high < need_high) + pcp->high = clamp(need_high, high_min, high_max); + } + + return high; } static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, struct page *page, int migratetype, unsigned int order) { - int high; + int high, batch; int pindex; bool free_high = false; @@ -2384,6 +2437,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; + batch = READ_ONCE(pcp->batch); /* * As high-order pages other than THP's stored on PCP can contribute * to fragmentation, limit the number stored when PCP is heavily @@ -2394,14 +2448,15 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, free_high = (pcp->free_factor && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(pcp->batch))); + pcp->count >= READ_ONCE(batch))); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; } - high = nr_pcp_high(pcp, zone, free_high); + high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); + free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), + pcp, pindex); } } @@ -2685,24 +2740,38 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, return page; } -static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order) +static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) { - int high, batch, max_nr_alloc; + int high, base_batch, batch, max_nr_alloc; + int high_max, high_min; - high = READ_ONCE(pcp->high_min); - batch = READ_ONCE(pcp->batch); + base_batch = READ_ONCE(pcp->batch); + high_min = READ_ONCE(pcp->high_min); + high_max = READ_ONCE(pcp->high_max); + high = pcp->high = clamp(pcp->high, high_min, high_max); /* Check for PCP disabled or boot pageset */ - if (unlikely(high < batch)) + if (unlikely(high < base_batch)) return 1; + if (order) + batch = base_batch; + else + batch = (base_batch << pcp->alloc_factor); + /* - * Double the number of pages allocated each time there is subsequent - * allocation of order-0 pages without any freeing. + * If we had larger pcp->high, we could avoid to allocate from + * zone. */ + if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + high = pcp->high = min(high + batch, high_max); + if (!order) { - max_nr_alloc = max(high - pcp->count - batch, batch); - batch <<= pcp->alloc_factor; + max_nr_alloc = max(high - pcp->count - base_batch, base_batch); + /* + * Double the number of pages allocated each time there is + * subsequent allocation of order-0 pages without any freeing. + */ if (batch <= max_nr_alloc && pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX) pcp->alloc_factor++; @@ -2733,7 +2802,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, do { if (list_empty(list)) { - int batch = nr_pcp_alloc(pcp, order); + int batch = nr_pcp_alloc(pcp, zone, order); int alloced; alloced = rmqueue_bulk(zone, order, diff --git a/mm/vmstat.c b/mm/vmstat.c index 88ea95d4221c..359460deb377 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -816,9 +816,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets) for_each_populated_zone(zone) { struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats; -#ifdef CONFIG_NUMA struct per_cpu_pages __percpu *pcp = zone->per_cpu_pageset; -#endif for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { int v; @@ -834,10 +832,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets) #endif } } -#ifdef CONFIG_NUMA if (do_pagesets) { cond_resched(); + + changes += decay_pcp_high(zone, this_cpu_ptr(pcp)); +#ifdef CONFIG_NUMA /* * Deal with draining the remote pageset of this * processor @@ -866,8 +866,8 @@ static int refresh_cpu_vm_stats(bool do_pagesets) drain_zone_pages(zone, this_cpu_ptr(pcp)); changes++; } - } #endif + } } for_each_online_pgdat(pgdat) { -- cgit v1.2.3 From 57c0419c5f0ea2ccab8700895c8fac20ba1eb21f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:01 +0800 Subject: mm, pcp: decrease PCP high if free pages < high watermark One target of PCP is to minimize pages in PCP if the system free pages is too few. To reach that target, when page reclaiming is active for the zone (ZONE_RECLAIM_ACTIVE), we will stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. But this may be too late because the background page reclaiming may introduce latency for some workloads. So, in this patch, during page allocation we will detect whether the number of free pages of the zone is below high watermark. If so, we will stop increasing PCP high in allocating path, decrease PCP high and free some pages in freeing path. With this, we can reduce the possibility of the premature background page reclaiming caused by too large PCP. The high watermark checking is done in allocating path to reduce the overhead in hotter freeing path. Link: https://lkml.kernel.org/r/20231016053002.756205-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 775abc899e80..b92ab001e146 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1016,6 +1016,7 @@ enum zone_flags { * Cleared when kswapd is woken. */ ZONE_RECLAIM_ACTIVE, /* kswapd may be scanning the zone. */ + ZONE_BELOW_HIGH, /* zone is below high watermark. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 83af76a8cef9..58ab8389da05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2407,7 +2407,13 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return min(batch << 2, pcp->high); } - if (pcp->count >= high && high_min != high_max) { + if (high_min == high_max) + return high; + + if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { + pcp->high = max(high - (batch << pcp->free_factor), high_min); + high = max(pcp->count, high_min); + } else if (pcp->count >= high) { int need_high = (batch << pcp->free_factor) + batch; /* pcp->high should be large enough to hold batch freed pages */ @@ -2457,6 +2463,10 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), pcp, pindex); + if (test_bit(ZONE_BELOW_HIGH, &zone->flags) && + zone_watermark_ok(zone, 0, high_wmark_pages(zone), + ZONE_MOVABLE, 0)) + clear_bit(ZONE_BELOW_HIGH, &zone->flags); } } @@ -2763,7 +2773,7 @@ static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone, int order) * If we had larger pcp->high, we could avoid to allocate from * zone. */ - if (high_min != high_max && !test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) + if (high_min != high_max && !test_bit(ZONE_BELOW_HIGH, &zone->flags)) high = pcp->high = min(high + batch, high_max); if (!order) { @@ -3225,6 +3235,25 @@ retry: } } + /* + * Detect whether the number of free pages is below high + * watermark. If so, we will decrease pcp->high and free + * PCP pages in free path to reduce the possibility of + * premature page reclaiming. Detection is done here to + * avoid to do that in hotter free path. + */ + if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) + goto check_alloc_wmark; + + mark = high_wmark_pages(zone); + if (zone_watermark_fast(zone, order, mark, + ac->highest_zoneidx, alloc_flags, + gfp_mask)) + goto try_this_zone; + else + set_bit(ZONE_BELOW_HIGH, &zone->flags); + +check_alloc_wmark: mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, ac->highest_zoneidx, alloc_flags, -- cgit v1.2.3 From 6ccdcb6d3a741c4e005ca6ffd4a62ddf8b5bead3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 16 Oct 2023 13:30:02 +0800 Subject: mm, pcp: reduce detecting time of consecutive high order page freeing In current PCP auto-tuning design, if the number of pages allocated is much more than that of pages freed on a CPU, the PCP high may become the maximal value even if the allocating/freeing depth is small, for example, in the sender of network workloads. If a CPU was used as sender originally, then it is used as receiver after context switching, we need to fill the whole PCP with maximal high before triggering PCP draining for consecutive high order freeing. This will hurt the performance of some network workloads. To solve the issue, in this patch, we will track the consecutive page freeing with a counter in stead of relying on PCP draining. So, we can detect consecutive page freeing much earlier. On a 2-socket Intel server with 128 logical CPU, we tested SCTP_STREAM_MANY test case of netperf test suite with 64-pair processes. With the patch, the network bandwidth improves 5.0%. This restores the performance drop caused by PCP auto-tuning. Link: https://lkml.kernel.org/r/20231016053002.756205-10-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Cc: David Hildenbrand Cc: Johannes Weiner Cc: Dave Hansen Cc: Michal Hocko Cc: Pavel Tatashin Cc: Matthew Wilcox Cc: Christoph Lameter Cc: Arjan van de Ven Cc: Sudeep Holla Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 +- mm/page_alloc.c | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b92ab001e146..3c25226beeed 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -698,10 +698,10 @@ struct per_cpu_pages { int batch; /* chunk size for buddy add/remove */ u8 flags; /* protected by pcp->lock */ u8 alloc_factor; /* batch scaling factor during allocate */ - u8 free_factor; /* batch scaling factor during free */ #ifdef CONFIG_NUMA u8 expire; /* When 0, remote pagesets are drained */ #endif + short free_count; /* consecutive free count */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[NR_PCP_LISTS]; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 58ab8389da05..d52718284029 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2369,13 +2369,10 @@ static int nr_pcp_free(struct per_cpu_pages *pcp, int batch, int high, bool free max_nr_free = high - batch; /* - * Double the number of pages freed each time there is subsequent - * freeing of pages without any allocation. + * Increase the batch number to the number of the consecutive + * freed pages to reduce zone lock contention. */ - batch <<= pcp->free_factor; - if (batch <= max_nr_free && pcp->free_factor < CONFIG_PCP_BATCH_SCALE_MAX) - pcp->free_factor++; - batch = clamp(batch, min_nr_free, max_nr_free); + batch = clamp_t(int, pcp->free_count, min_nr_free, max_nr_free); return batch; } @@ -2403,7 +2400,9 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, * stored on pcp lists */ if (test_bit(ZONE_RECLAIM_ACTIVE, &zone->flags)) { - pcp->high = max(high - (batch << pcp->free_factor), high_min); + int free_count = max_t(int, pcp->free_count, batch); + + pcp->high = max(high - free_count, high_min); return min(batch << 2, pcp->high); } @@ -2411,10 +2410,12 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone, return high; if (test_bit(ZONE_BELOW_HIGH, &zone->flags)) { - pcp->high = max(high - (batch << pcp->free_factor), high_min); + int free_count = max_t(int, pcp->free_count, batch); + + pcp->high = max(high - free_count, high_min); high = max(pcp->count, high_min); } else if (pcp->count >= high) { - int need_high = (batch << pcp->free_factor) + batch; + int need_high = pcp->free_count + batch; /* pcp->high should be large enough to hold batch freed pages */ if (pcp->high < need_high) @@ -2451,7 +2452,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, * stops will be drained from vmstat refresh context. */ if (order && order <= PAGE_ALLOC_COSTLY_ORDER) { - free_high = (pcp->free_factor && + free_high = (pcp->free_count >= batch && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || pcp->count >= READ_ONCE(batch))); @@ -2459,6 +2460,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; } + if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX)) + pcp->free_count += (1 << order); high = nr_pcp_high(pcp, zone, batch, free_high); if (pcp->count >= high) { free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high), @@ -2855,7 +2858,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, * See nr_pcp_free() where free_factor is increased for subsequent * frees. */ - pcp->free_factor >>= 1; + pcp->free_count >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); pcp_spin_unlock(pcp); @@ -5488,7 +5491,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta pcp->high_min = BOOT_PAGESET_HIGH; pcp->high_max = BOOT_PAGESET_HIGH; pcp->batch = BOOT_PAGESET_BATCH; - pcp->free_factor = 0; + pcp->free_count = 0; } static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high_min, -- cgit v1.2.3 From 7d0715d0d6b28a831b6fdfefb29c5a7a4929fa49 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:41 -0700 Subject: mm: kmem: optimize get_obj_cgroup_from_current() Patch series "mm: improve performance of accounted kernel memory allocations", v5. This patchset improves the performance of accounted kernel memory allocations by ~30% as measured by a micro-benchmark [1]. The benchmark is very straightforward: 1M of 64 bytes-large kmalloc() allocations. Below are results with the disabled kernel memory accounting, the original state and with this patchset applied. | | Kmem disabled | Original | Patched | Delta | |-------------+---------------+----------+---------+--------| | User cgroup | 29764 | 84548 | 59078 | -30.0% | | Root cgroup | 29742 | 48342 | 31501 | -34.8% | As we can see, the patchset removes the majority of the overhead when there is no actual accounting (a task belongs to the root memory cgroup) and almost halves the accounting overhead otherwise. The main idea is to get rid of unnecessary memcg to objcg conversions and switch to a scope-based protection of objcgs, which eliminates extra operations with objcg reference counters under a rcu read lock. More details are provided in individual commit descriptions. This patch (of 5): Manually inline memcg_kmem_bypass() and active_memcg() to speed up get_obj_cgroup_from_current() by avoiding duplicate in_task() checks and active_memcg() readings. Also add a likely() macro to __get_obj_cgroup_from_memcg(): obj_cgroup_tryget() should succeed at almost all times except a very unlikely race with the memcg deletion path. Link: https://lkml.kernel.org/r/20231019225346.1822282-1-roman.gushchin@linux.dev Link: https://lkml.kernel.org/r/20231019225346.1822282-2-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Acked-by: Johannes Weiner Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a86e7b445800..975ba766c16f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1086,19 +1086,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -static __always_inline bool memcg_kmem_bypass(void) -{ - /* Allow remote memcg charging from any context. */ - if (unlikely(active_memcg())) - return false; - - /* Memcg to charge can't be determined. */ - if (!in_task() || !current->mm || (current->flags & PF_KTHREAD)) - return true; - - return false; -} - /** * get_mem_cgroup_from_current - Obtain a reference on current task's memcg. */ @@ -3089,7 +3076,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { objcg = rcu_dereference(memcg->objcg); - if (objcg && obj_cgroup_tryget(objcg)) + if (likely(objcg && obj_cgroup_tryget(objcg))) break; objcg = NULL; } @@ -3098,16 +3085,23 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { - struct obj_cgroup *objcg = NULL; struct mem_cgroup *memcg; + struct obj_cgroup *objcg; - if (memcg_kmem_bypass()) - return NULL; + if (in_task()) { + memcg = current->active_memcg; + + /* Memcg to charge can't be determined. */ + if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD))) + return NULL; + } else { + memcg = this_cpu_read(int_active_memcg); + if (likely(!memcg)) + return NULL; + } rcu_read_lock(); - if (unlikely(active_memcg())) - memcg = active_memcg(); - else + if (!memcg) memcg = mem_cgroup_from_task(current); objcg = __get_obj_cgroup_from_memcg(memcg); rcu_read_unlock(); -- cgit v1.2.3 From 1aacbd354313f25c855e662e41c04e2abf71444a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:42 -0700 Subject: mm: kmem: add direct objcg pointer to task_struct To charge a freshly allocated kernel object to a memory cgroup, the kernel needs to obtain an objcg pointer. Currently it does it indirectly by obtaining the memcg pointer first and then calling to __get_obj_cgroup_from_memcg(). Usually tasks spend their entire life belonging to the same object cgroup. So it makes sense to save the objcg pointer on task_struct directly, so it can be obtained faster. It requires some work on fork, exit and cgroup migrate paths, but these paths are way colder. To avoid any costly synchronization the following rules are applied: 1) A task sets it's objcg pointer itself. 2) If a task is being migrated to another cgroup, the least significant bit of the objcg pointer is set atomically. 3) On the allocation path the objcg pointer is obtained locklessly using the READ_ONCE() macro and the least significant bit is checked. If it's set, the following procedure is used to update it locklessly: - task->objcg is zeroed using cmpxcg - new objcg pointer is obtained - task->objcg is updated using try_cmpxchg - operation is repeated if try_cmpxcg fails It guarantees that no updates will be lost if task migration is racing against objcg pointer update. It also allows to keep both read and write paths fully lockless. Because the task is keeping a reference to the objcg, it can't go away while the task is alive. This commit doesn't change the way the remote memcg charging works. Link: https://lkml.kernel.org/r/20231019225346.1822282-3-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Johannes Weiner Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/sched.h | 4 ++ mm/memcontrol.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 134 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..60de42715b56 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1443,6 +1443,10 @@ struct task_struct { struct mem_cgroup *active_memcg; #endif +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *objcg; +#endif + #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 975ba766c16f..96f4c319f025 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -249,6 +249,9 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) return container_of(vmpr, struct mem_cgroup, vmpressure); } +#define CURRENT_OBJCG_UPDATE_BIT 0 +#define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT) + #ifdef CONFIG_MEMCG_KMEM static DEFINE_SPINLOCK(objcg_lock); @@ -3083,6 +3086,58 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) return objcg; } +static struct obj_cgroup *current_objcg_update(void) +{ + struct mem_cgroup *memcg; + struct obj_cgroup *old, *objcg = NULL; + + do { + /* Atomically drop the update bit. */ + old = xchg(¤t->objcg, NULL); + if (old) { + old = (struct obj_cgroup *) + ((unsigned long)old & ~CURRENT_OBJCG_UPDATE_FLAG); + if (old) + obj_cgroup_put(old); + + old = NULL; + } + + /* If new objcg is NULL, no reason for the second atomic update. */ + if (!current->mm || (current->flags & PF_KTHREAD)) + return NULL; + + /* + * Release the objcg pointer from the previous iteration, + * if try_cmpxcg() below fails. + */ + if (unlikely(objcg)) { + obj_cgroup_put(objcg); + objcg = NULL; + } + + /* + * Obtain the new objcg pointer. The current task can be + * asynchronously moved to another memcg and the previous + * memcg can be offlined. So let's get the memcg pointer + * and try get a reference to objcg under a rcu read lock. + */ + + rcu_read_lock(); + memcg = mem_cgroup_from_task(current); + objcg = __get_obj_cgroup_from_memcg(memcg); + rcu_read_unlock(); + + /* + * Try set up a new objcg pointer atomically. If it + * fails, it means the update flag was set concurrently, so + * the whole procedure should be repeated. + */ + } while (!try_cmpxchg(¤t->objcg, &old, objcg)); + + return objcg; +} + __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct mem_cgroup *memcg; @@ -3090,19 +3145,26 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) if (in_task()) { memcg = current->active_memcg; + if (unlikely(memcg)) + goto from_memcg; - /* Memcg to charge can't be determined. */ - if (likely(!memcg) && (!current->mm || (current->flags & PF_KTHREAD))) - return NULL; + objcg = READ_ONCE(current->objcg); + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) + objcg = current_objcg_update(); + + if (objcg) { + obj_cgroup_get(objcg); + return objcg; + } } else { memcg = this_cpu_read(int_active_memcg); - if (likely(!memcg)) - return NULL; + if (unlikely(memcg)) + goto from_memcg; } + return NULL; +from_memcg: rcu_read_lock(); - if (!memcg) - memcg = mem_cgroup_from_task(current); objcg = __get_obj_cgroup_from_memcg(memcg); rcu_read_unlock(); return objcg; @@ -6440,6 +6502,7 @@ static void mem_cgroup_move_task(void) mem_cgroup_clear_mc(); } } + #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) { @@ -6453,8 +6516,39 @@ static void mem_cgroup_move_task(void) } #endif +#ifdef CONFIG_MEMCG_KMEM +static void mem_cgroup_fork(struct task_struct *task) +{ + /* + * Set the update flag to cause task->objcg to be initialized lazily + * on the first allocation. It can be done without any synchronization + * because it's always performed on the current task, so does + * current_objcg_update(). + */ + task->objcg = (struct obj_cgroup *)CURRENT_OBJCG_UPDATE_FLAG; +} + +static void mem_cgroup_exit(struct task_struct *task) +{ + struct obj_cgroup *objcg = task->objcg; + + objcg = (struct obj_cgroup *) + ((unsigned long)objcg & ~CURRENT_OBJCG_UPDATE_FLAG); + if (objcg) + obj_cgroup_put(objcg); + + /* + * Some kernel allocations can happen after this point, + * but let's ignore them. It can be done without any synchronization + * because it's always performed on the current task, so does + * current_objcg_update(). + */ + task->objcg = NULL; +} +#endif + #ifdef CONFIG_LRU_GEN -static void mem_cgroup_attach(struct cgroup_taskset *tset) +static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; @@ -6472,10 +6566,31 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset) task_unlock(task); } #else +static void mem_cgroup_lru_gen_attach(struct cgroup_taskset *tset) {} +#endif /* CONFIG_LRU_GEN */ + +#ifdef CONFIG_MEMCG_KMEM +static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + + cgroup_taskset_for_each(task, css, tset) { + /* atomically set the update bit */ + set_bit(CURRENT_OBJCG_UPDATE_BIT, (unsigned long *)&task->objcg); + } +} +#else +static void mem_cgroup_kmem_attach(struct cgroup_taskset *tset) {} +#endif /* CONFIG_MEMCG_KMEM */ + +#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) static void mem_cgroup_attach(struct cgroup_taskset *tset) { + mem_cgroup_lru_gen_attach(tset); + mem_cgroup_kmem_attach(tset); } -#endif /* CONFIG_LRU_GEN */ +#endif static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { @@ -6885,9 +7000,15 @@ struct cgroup_subsys memory_cgrp_subsys = { .css_reset = mem_cgroup_css_reset, .css_rstat_flush = mem_cgroup_css_rstat_flush, .can_attach = mem_cgroup_can_attach, +#if defined(CONFIG_LRU_GEN) || defined(CONFIG_MEMCG_KMEM) .attach = mem_cgroup_attach, +#endif .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, +#ifdef CONFIG_MEMCG_KMEM + .fork = mem_cgroup_fork, + .exit = mem_cgroup_exit, +#endif .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, -- cgit v1.2.3 From 675d6c9b59e313ca2573c93e8fd87011a99bb8ce Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:43 -0700 Subject: mm: kmem: make memcg keep a reference to the original objcg Keep a reference to the original objcg object for the entire life of a memcg structure. This allows to simplify the synchronization on the kernel memory allocation paths: pinning a (live) memcg will also pin the corresponding objcg. The memory overhead of this change is minimal because object cgroups usually outlive their corresponding memory cgroups even without this change, so it's only an additional pointer per memcg. Link: https://lkml.kernel.org/r/20231019225346.1822282-4-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 8 +++++++- mm/memcontrol.c | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6674c12725d5..cc110cc8fdfc 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -289,7 +289,13 @@ struct mem_cgroup { #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; - struct obj_cgroup __rcu *objcg; + /* + * memcg->objcg is wiped out as a part of the objcg repaprenting + * process. memcg->orig_objcg preserves a pointer (and a reference) + * to the original objcg until the end of live of memcg. + */ + struct obj_cgroup __rcu *objcg; + struct obj_cgroup *orig_objcg; /* list of inherited objcgs, protected by objcg_lock */ struct list_head objcg_list; #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 96f4c319f025..ff036d5d339d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3899,6 +3899,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) objcg->memcg = memcg; rcu_assign_pointer(memcg->objcg, objcg); + obj_cgroup_get(objcg); + memcg->orig_objcg = objcg; static_branch_enable(&memcg_kmem_online_key); @@ -5406,6 +5408,9 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + if (memcg->orig_objcg) + obj_cgroup_put(memcg->orig_objcg); + for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); kfree(memcg->vmstats); -- cgit v1.2.3 From e86828e5446d95676835679837d995dec188d2be Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:44 -0700 Subject: mm: kmem: scoped objcg protection Switch to a scope-based protection of the objcg pointer on slab/kmem allocation paths. Instead of using the get_() semantics in the pre-allocation hook and put the reference afterwards, let's rely on the fact that objcg is pinned by the scope. It's possible because: 1) if the objcg is received from the current task struct, the task is keeping a reference to the objcg. 2) if the objcg is received from an active memcg (remote charging), the memcg is pinned by the scope and has a reference to the corresponding objcg. Link: https://lkml.kernel.org/r/20231019225346.1822282-5-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 9 +++++++++ include/linux/sched/mm.h | 4 ++++ mm/memcontrol.c | 47 ++++++++++++++++++++++++++++++++++++++++++++-- mm/slab.h | 15 ++++++++------- 4 files changed, 66 insertions(+), 9 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index cc110cc8fdfc..8006bc3bd7bf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1796,6 +1796,15 @@ bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); +/* + * The returned objcg pointer is safe to use without additional + * protection within a scope. The scope is defined either by + * the current task (similar to the "current" global variable) + * or by set_active_memcg() pair. + * Please, use obj_cgroup_get() to get a reference if the pointer + * needs to be used outside of the local scope. + */ +struct obj_cgroup *current_obj_cgroup(void); struct obj_cgroup *get_obj_cgroup_from_current(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 8d89c8c4fac1..9a19f1b42f64 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -403,6 +403,10 @@ DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * + * Please, make sure that caller has a reference to the passed memcg structure, + * so its lifetime is guaranteed to exceed the scope between two + * set_active_memcg() calls. + * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ff036d5d339d..a6457c8b5e16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3170,6 +3170,49 @@ from_memcg: return objcg; } +__always_inline struct obj_cgroup *current_obj_cgroup(void) +{ + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; + + if (in_task()) { + memcg = current->active_memcg; + if (unlikely(memcg)) + goto from_memcg; + + objcg = READ_ONCE(current->objcg); + if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) + objcg = current_objcg_update(); + /* + * Objcg reference is kept by the task, so it's safe + * to use the objcg by the current task. + */ + return objcg; + } + + memcg = this_cpu_read(int_active_memcg); + if (unlikely(memcg)) + goto from_memcg; + + return NULL; + +from_memcg: + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { + /* + * Memcg pointer is protected by scope (see set_active_memcg()) + * and is pinning the corresponding objcg, so objcg can't go + * away and can be used within the scope without any additional + * protection. + */ + objcg = rcu_dereference_check(memcg->objcg, 1); + if (likely(objcg)) + break; + objcg = NULL; + } + + return objcg; +} + struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; @@ -3264,15 +3307,15 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) struct obj_cgroup *objcg; int ret = 0; - objcg = get_obj_cgroup_from_current(); + objcg = current_obj_cgroup(); if (objcg) { ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order); if (!ret) { + obj_cgroup_get(objcg); page->memcg_data = (unsigned long)objcg | MEMCG_DATA_KMEM; return 0; } - obj_cgroup_put(objcg); } return ret; } diff --git a/mm/slab.h b/mm/slab.h index 799a315695c6..3d07fb428393 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -484,7 +484,12 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT)) return true; - objcg = get_obj_cgroup_from_current(); + /* + * The obtained objcg pointer is safe to use within the current scope, + * defined by current task or set_active_memcg() pair. + * obj_cgroup_get() is used to get a permanent reference. + */ + objcg = current_obj_cgroup(); if (!objcg) return true; @@ -497,17 +502,14 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, css_put(&memcg->css); if (ret) - goto out; + return false; } if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) - goto out; + return false; *objcgp = objcg; return true; -out: - obj_cgroup_put(objcg); - return false; } static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, @@ -542,7 +544,6 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, obj_cgroup_uncharge(objcg, obj_full_size(s)); } } - obj_cgroup_put(objcg); } static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, -- cgit v1.2.3 From c63b835d0eafc956c43b8c6605708240ac52b8cd Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:45 -0700 Subject: percpu: scoped objcg protection Similar to slab and kmem, switch to a scope-based protection of the objcg pointer to avoid. Link: https://lkml.kernel.org/r/20231019225346.1822282-6-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Tested-by: Naresh Kamboju Acked-by: Shakeel Butt Reviewed-by: Vlastimil Babka Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/percpu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index a7665de8485f..f53ba692d67a 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1628,14 +1628,12 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!memcg_kmem_online() || !(gfp & __GFP_ACCOUNT)) return true; - objcg = get_obj_cgroup_from_current(); + objcg = current_obj_cgroup(); if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { - obj_cgroup_put(objcg); + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) return false; - } *objcgp = objcg; return true; @@ -1649,6 +1647,7 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, return; if (likely(chunk && chunk->obj_cgroups)) { + obj_cgroup_get(objcg); chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; rcu_read_lock(); @@ -1657,7 +1656,6 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); - obj_cgroup_put(objcg); } } -- cgit v1.2.3 From e56808fef8f71a192b2740c0b6ea8be7ab865d54 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 19 Oct 2023 15:53:46 -0700 Subject: mm: kmem: reimplement get_obj_cgroup_from_current() Reimplement get_obj_cgroup_from_current() using current_obj_cgroup(). get_obj_cgroup_from_current() and current_obj_cgroup() share 80% of the code, so the new implementation is almost trivial. get_obj_cgroup_from_current() is a convenient function used by the bpf subsystem, so there is no reason to get rid of it completely. Link: https://lkml.kernel.org/r/20231019225346.1822282-7-roman.gushchin@linux.dev Signed-off-by: Roman Gushchin (Cruise) Reviewed-by: Vlastimil Babka Acked-by: Shakeel Butt Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Naresh Kamboju Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 11 ++++++++++- mm/memcontrol.c | 32 -------------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 8006bc3bd7bf..b3d2e3e60eed 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1805,9 +1805,18 @@ void __memcg_kmem_uncharge_page(struct page *page, int order); * needs to be used outside of the local scope. */ struct obj_cgroup *current_obj_cgroup(void); -struct obj_cgroup *get_obj_cgroup_from_current(void); struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); +static inline struct obj_cgroup *get_obj_cgroup_from_current(void) +{ + struct obj_cgroup *objcg = current_obj_cgroup(); + + if (objcg) + obj_cgroup_get(objcg); + + return objcg; +} + int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6457c8b5e16..8b0859b8cc03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3138,38 +3138,6 @@ static struct obj_cgroup *current_objcg_update(void) return objcg; } -__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) -{ - struct mem_cgroup *memcg; - struct obj_cgroup *objcg; - - if (in_task()) { - memcg = current->active_memcg; - if (unlikely(memcg)) - goto from_memcg; - - objcg = READ_ONCE(current->objcg); - if (unlikely((unsigned long)objcg & CURRENT_OBJCG_UPDATE_FLAG)) - objcg = current_objcg_update(); - - if (objcg) { - obj_cgroup_get(objcg); - return objcg; - } - } else { - memcg = this_cpu_read(int_active_memcg); - if (unlikely(memcg)) - goto from_memcg; - } - return NULL; - -from_memcg: - rcu_read_lock(); - objcg = __get_obj_cgroup_from_memcg(memcg); - rcu_read_unlock(); - return objcg; -} - __always_inline struct obj_cgroup *current_obj_cgroup(void) { struct mem_cgroup *memcg; -- cgit v1.2.3 From e5b306a082982cb98bc7ec48a382225522401a61 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 17 Oct 2023 09:17:28 +0800 Subject: mm/swap: avoid a xa load for swapout path A variable is never used for swapout path (shadowp is NULL) and compiler is unable to optimize out the unneeded load since it's a function call. The was introduced by 3852f6768ede ("mm/swapcache: support to handle the shadow entries"). Link: https://lkml.kernel.org/r/20231017011728.37508-1-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Matthew Wilcox (Oracle) Cc: Huang Ying Signed-off-by: Andrew Morton --- mm/swap_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index b3b14bd0dd64..ab79ffb71736 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -109,9 +109,9 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); - old = xas_load(&xas); - if (xa_is_value(old)) { - if (shadowp) + if (shadowp) { + old = xas_load(&xas); + if (xa_is_value(old)) *shadowp = old; } xas_store(&xas, folio); -- cgit v1.2.3 From 1d44f2e6d178163a94980fd5f9a4b04b6b36535b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:48 +0800 Subject: mm_types: add virtual and _last_cpupid into struct folio Patch series "mm: convert page cpupid functions to folios", v3. The cpupid(or access time) used by numa balancing is stored in flags or _last_cpupid(if LAST_CPUPID_NOT_IN_PAGE_FLAGS) of page, this is to convert page cpupid to folio cpupid, a new _last_cpupid is added into folio, which make us to use folio->_last_cpupid directly, and the page cpupid functions are converted to folio ones. page_cpupid_last() -> folio_last_cpupid() xchg_page_access_time() -> folio_xchg_access_time() page_cpupid_xchg_last() -> folio_xchg_last_cpupid() This patch (of 19): If WANT_PAGE_VIRTUAL and LAST_CPUPID_NOT_IN_PAGE_FLAGS defined, the 'virtual' and '_last_cpupid' are in struct page, and since _last_cpupid is used by numa balancing feature, it is better to move it before KMSAN metadata from struct page, also add them into struct folio to make us to access them from folio directly. Link: https://lkml.kernel.org/r/20231018140806.2783514-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20231018140806.2783514-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21eb56145f57..692e41213cd3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,6 +199,10 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + int _last_cpupid; +#endif + #ifdef CONFIG_KMSAN /* * KMSAN metadata for this page: @@ -210,10 +214,6 @@ struct page { struct page *kmsan_shadow; struct page *kmsan_origin; #endif - -#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS - int _last_cpupid; -#endif } _struct_page_alignment; /* @@ -272,6 +272,8 @@ typedef struct { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @virtual: Virtual address in the kernel direct map. + * @_last_cpupid: IDs of last CPU and last process that accessed the folio. * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). @@ -317,6 +319,12 @@ struct folio { atomic_t _refcount; #ifdef CONFIG_MEMCG unsigned long memcg_data; +#endif +#if defined(WANT_PAGE_VIRTUAL) + void *virtual; +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS + int _last_cpupid; #endif /* private: the union with struct page is transitional */ }; @@ -373,6 +381,12 @@ FOLIO_MATCH(_refcount, _refcount); #ifdef CONFIG_MEMCG FOLIO_MATCH(memcg_data, memcg_data); #endif +#if defined(WANT_PAGE_VIRTUAL) +FOLIO_MATCH(virtual, virtual); +#endif +#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +FOLIO_MATCH(_last_cpupid, _last_cpupid); +#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ -- cgit v1.2.3 From 155c98cfcf961327adedabd629edfc2301cf354b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:49 +0800 Subject: mm: add folio_last_cpupid() Add folio_last_cpupid() wrapper, which is required to convert page_cpupid_last() to folio vertion later in the series. Link: https://lkml.kernel.org/r/20231018140806.2783514-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 86e040e886e4..0c4ff852930b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ +static inline int folio_last_cpupid(struct folio *folio) +{ + return page_cpupid_last(&folio->page); +} + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* -- cgit v1.2.3 From 67b33e3ff58374b3fca929933ccc04a1858fda6a Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:50 +0800 Subject: mm: memory: use folio_last_cpupid() in do_numa_page() Convert to use folio_last_cpupid() in do_numa_page(). Link: https://lkml.kernel.org/r/20231018140806.2783514-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 1f88e4f6fbf2..0915eac772fe 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4858,7 +4858,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) !node_is_toptier(nid)) last_cpupid = (-1 & LAST_CPUPID_MASK); else - last_cpupid = page_cpupid_last(&folio->page); + last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, vmf->address, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); -- cgit v1.2.3 From c4a8d2faab1f9165df1543795254b1c2470ce7f8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:51 +0800 Subject: mm: huge_memory: use folio_last_cpupid() in do_huge_pmd_numa_page() Convert to use folio_last_cpupid() in do_huge_pmd_numa_page(). Link: https://lkml.kernel.org/r/20231018140806.2783514-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c9cbcbf6697e..f9571bf92603 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1562,7 +1562,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) * to record page access time. So use default value. */ if (node_is_toptier(nid)) - last_cpupid = page_cpupid_last(&folio->page); + last_cpupid = folio_last_cpupid(folio); target_nid = numa_migrate_prep(folio, vma, haddr, nid, &flags); if (target_nid == NUMA_NO_NODE) { folio_put(folio); -- cgit v1.2.3 From 19c1ac02ce02158fa22eb53f2750525ae93da9ef Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:52 +0800 Subject: mm: huge_memory: use folio_last_cpupid() in __split_huge_page_tail() Convert to use folio_last_cpupid() in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231018140806.2783514-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f9571bf92603..5455dfe4c3c7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2514,7 +2514,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + page_cpupid_xchg_last(page_tail, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new -- cgit v1.2.3 From f39eac30a8f334f0765ef78fe4d13b3fd5bfa3fd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:53 +0800 Subject: mm: remove page_cpupid_last() Since all calls use folio_last_cpupid(), remove page_cpupid_last(). Link: https://lkml.kernel.org/r/20231018140806.2783514-7-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0c4ff852930b..f8b7a1825720 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1702,18 +1702,18 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); } -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio) { - return page->_last_cpupid; + return folio->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio) { - return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; + return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } extern int page_cpupid_xchg_last(struct page *page, int cpupid); @@ -1752,9 +1752,9 @@ static inline int xchg_page_access_time(struct page *page, int time) return 0; } -static inline int page_cpupid_last(struct page *page) +static inline int folio_last_cpupid(struct folio *folio) { - return page_to_nid(page); /* XXX */ + return folio_nid(folio); /* XXX */ } static inline int cpupid_to_nid(int cpupid) @@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ -static inline int folio_last_cpupid(struct folio *folio) -{ - return page_cpupid_last(&folio->page); -} - #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* -- cgit v1.2.3 From 55c199385c4465e9abe1a3d6d1aba348d0356e03 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:54 +0800 Subject: mm: add folio_xchg_access_time() Add folio_xchg_access_time() wrapper, which is required to convert xchg_page_access_time() to folio vertion later in the series. Link: https://lkml.kernel.org/r/20231018140806.2783514-8-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index f8b7a1825720..dee63b820f0c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1796,6 +1796,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ +static inline int folio_xchg_access_time(struct folio *folio, int time) +{ + return xchg_page_access_time(&folio->page, time); +} + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* -- cgit v1.2.3 From 0b201c3624ae9f58ebfff8484f304f3008fb01b8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:55 +0800 Subject: sched/fair: use folio_xchg_access_time() in numa_hint_fault_latency() Convert to use folio_xchg_access_time() in numa_hint_fault_latency(). Link: https://lkml.kernel.org/r/20231018140806.2783514-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 42aefe7e6fdc..3ee9d3564c20 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1727,7 +1727,7 @@ static int numa_hint_fault_latency(struct folio *folio) int last_time, time; time = jiffies_to_msecs(jiffies); - last_time = xchg_page_access_time(&folio->page, time); + last_time = folio_xchg_access_time(folio, time); return (time - last_time) & PAGE_ACCESS_TIME_MASK; } -- cgit v1.2.3 From ec1778807a8053d14cde7cfd75fbd66e0c7b9c9f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:56 +0800 Subject: mm: mprotect: use a folio in change_pte_range() Use a folio in change_pte_range() to save three compound_head() calls. Since now only normal and PMD-mapped page is handled by numa balancing, it is enough to only update the entire folio's access time. Link: https://lkml.kernel.org/r/20231018140806.2783514-10-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/mprotect.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index f1dc8f8c84ef..81991102f785 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -114,7 +114,7 @@ static long change_pte_range(struct mmu_gather *tlb, * pages. See similar comment in change_huge_pmd. */ if (prot_numa) { - struct page *page; + struct folio *folio; int nid; bool toptier; @@ -122,13 +122,14 @@ static long change_pte_range(struct mmu_gather *tlb, if (pte_protnone(oldpte)) continue; - page = vm_normal_page(vma, addr, oldpte); - if (!page || is_zone_device_page(page) || PageKsm(page)) + folio = vm_normal_folio(vma, addr, oldpte); + if (!folio || folio_is_zone_device(folio) || + folio_test_ksm(folio)) continue; /* Also skip shared copy-on-write pages */ if (is_cow_mapping(vma->vm_flags) && - page_count(page) != 1) + folio_ref_count(folio) != 1) continue; /* @@ -136,14 +137,15 @@ static long change_pte_range(struct mmu_gather *tlb, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_lru(page) && PageDirty(page)) + if (folio_is_file_lru(folio) && + folio_test_dirty(folio)) continue; /* * Don't mess with PTEs if page is already on the node * a single-threaded process is running on. */ - nid = page_to_nid(page); + nid = folio_nid(folio); if (target_node == nid) continue; toptier = node_is_toptier(nid); @@ -157,7 +159,7 @@ static long change_pte_range(struct mmu_gather *tlb, continue; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier) - xchg_page_access_time(page, + folio_xchg_access_time(folio, jiffies_to_msecs(jiffies)); } -- cgit v1.2.3 From d986ba2b1953f761d3859c22160e82c58ed4287d Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:57 +0800 Subject: mm: huge_memory: use a folio in change_huge_pmd() Use a folio in change_huge_pmd(), which helps to remove last xchg_page_access_time() caller. Link: https://lkml.kernel.org/r/20231018140806.2783514-11-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5455dfe4c3c7..f01f345141da 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1856,7 +1856,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); - struct page *page = pfn_swap_entry_to_page(entry); + struct folio *folio = page_folio(pfn_swap_entry_to_page(entry)); pmd_t newpmd; VM_BUG_ON(!is_pmd_migration_entry(*pmd)); @@ -1865,7 +1865,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * A protection check is difficult so * just be safe and disable write */ - if (PageAnon(page)) + if (folio_test_anon(folio)) entry = make_readable_exclusive_migration_entry(swp_offset(entry)); else entry = make_readable_migration_entry(swp_offset(entry)); @@ -1887,7 +1887,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, #endif if (prot_numa) { - struct page *page; + struct folio *folio; bool toptier; /* * Avoid trapping faults against the zero page. The read-only @@ -1900,8 +1900,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_protnone(*pmd)) goto unlock; - page = pmd_page(*pmd); - toptier = node_is_toptier(page_to_nid(page)); + folio = page_folio(pmd_page(*pmd)); + toptier = node_is_toptier(folio_nid(folio)); /* * Skip scanning top tier node if normal numa * balancing is disabled @@ -1912,7 +1912,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && !toptier) - xchg_page_access_time(page, jiffies_to_msecs(jiffies)); + folio_xchg_access_time(folio, + jiffies_to_msecs(jiffies)); } /* * In case prot_numa, we are under mmap_read_lock(mm). It's critical -- cgit v1.2.3 From f393084382fa3bbd5840b428d538dbcb33be0186 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:58 +0800 Subject: mm: remove xchg_page_access_time() Since all calls use folio_xchg_access_time(), remove xchg_page_access_time(). Link: https://lkml.kernel.org/r/20231018140806.2783514-12-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dee63b820f0c..e003465238c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1724,11 +1724,12 @@ static inline void page_cpupid_reset_last(struct page *page) } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ -static inline int xchg_page_access_time(struct page *page, int time) +static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; - last_time = page_cpupid_xchg_last(page, time >> PAGE_ACCESS_TIME_BUCKETS); + last_time = page_cpupid_xchg_last(&folio->page, + time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } @@ -1747,7 +1748,7 @@ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) return page_to_nid(page); /* XXX */ } -static inline int xchg_page_access_time(struct page *page, int time) +static inline int folio_xchg_access_time(struct folio *folio, int time) { return 0; } @@ -1796,11 +1797,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ -static inline int folio_xchg_access_time(struct folio *folio, int time) -{ - return xchg_page_access_time(&folio->page, time); -} - #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* -- cgit v1.2.3 From 136d0b47576f8701d68c2d504e7237d9fdc4ebbd Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:07:59 +0800 Subject: mm: add folio_xchg_last_cpupid() Add folio_xchg_last_cpupid() wrapper, which is required to convert page_cpupid_xchg_last() to folio vertion later in the series. Link: https://lkml.kernel.org/r/20231018140806.2783514-13-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index e003465238c4..00e0874f7ce2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1797,6 +1797,11 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) +{ + return page_cpupid_xchg_last(&folio->page, cpupid); +} + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* -- cgit v1.2.3 From 1b143cc77f2074dd43b610d6bfffc822d20b6e16 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:00 +0800 Subject: sched/fair: use folio_xchg_last_cpupid() in should_numa_migrate_memory() Convert to use folio_xchg_last_cpupid() in should_numa_migrate_memory(). Link: https://lkml.kernel.org/r/20231018140806.2783514-14-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3ee9d3564c20..d1a765cdf6e4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1823,7 +1823,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, } this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); - last_cpupid = page_cpupid_xchg_last(&folio->page, this_cpupid); + last_cpupid = folio_xchg_last_cpupid(folio, this_cpupid); if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) -- cgit v1.2.3 From 4e694fe4d2fa3031392bdbeaa88066f67c886a0c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:01 +0800 Subject: mm: migrate: use folio_xchg_last_cpupid() in folio_migrate_flags() Convert to use folio_xchg_last_cpupid() in folio_migrate_flags(), also directly use folio_nid() instead of page_to_nid(&folio->page). Link: https://lkml.kernel.org/r/20231018140806.2783514-15-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/migrate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index c602bf6dec97..71fc5064cec6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -588,20 +588,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. */ - cpupid = page_cpupid_xchg_last(&folio->page, -1); + cpupid = folio_xchg_last_cpupid(folio, -1); /* * For memory tiering mode, when migrate between slow and fast * memory node, reset cpupid, because that is used to record * page access time in slow memory node. */ if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) { - bool f_toptier = node_is_toptier(page_to_nid(&folio->page)); - bool t_toptier = node_is_toptier(page_to_nid(&newfolio->page)); + bool f_toptier = node_is_toptier(folio_nid(folio)); + bool t_toptier = node_is_toptier(folio_nid(newfolio)); if (f_toptier != t_toptier) cpupid = -1; } - page_cpupid_xchg_last(&newfolio->page, cpupid); + folio_xchg_last_cpupid(newfolio, cpupid); folio_migrate_ksm(newfolio, folio); /* -- cgit v1.2.3 From c82530113480f8db9dd9584c51ec9326e6ce9790 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:02 +0800 Subject: mm: huge_memory: use folio_xchg_last_cpupid() in __split_huge_page_tail() Convert to use folio_xchg_last_cpupid() in __split_huge_page_tail(). Link: https://lkml.kernel.org/r/20231018140806.2783514-16-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f01f345141da..f31f02472396 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2515,7 +2515,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail, if (page_is_idle(head)) set_page_idle(page_tail); - page_cpupid_xchg_last(page_tail, folio_last_cpupid(folio)); + folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio)); /* * always add to the tail because some iterators expect new -- cgit v1.2.3 From c08b7e3830dbf24dd2552ddeea84f00393842f1b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:03 +0800 Subject: mm: make finish_mkwrite_fault() static Make finish_mkwrite_fault static since it is not used outside of memory.c. Link: https://lkml.kernel.org/r/20231018140806.2783514-17-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/memory.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 00e0874f7ce2..5d6e1d7656c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1348,7 +1348,6 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* diff --git a/mm/memory.c b/mm/memory.c index 0915eac772fe..2b6e2b239411 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3269,7 +3269,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ -vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, -- cgit v1.2.3 From a86bc96b77df40c27ead5ef4ac3837904b7eb53f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:04 +0800 Subject: mm: convert wp_page_reuse() and finish_mkwrite_fault() to take a folio Saves one compound_head() call, also in preparation for page_cpupid_xchg_last() conversion. Link: https://lkml.kernel.org/r/20231018140806.2783514-18-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2b6e2b239411..94791c5e2951 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3015,7 +3015,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline void wp_page_reuse(struct vm_fault *vmf) +static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -3023,7 +3023,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page)); + VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); /* * Clear the pages cpupid information as the existing @@ -3258,6 +3258,7 @@ out: * writeable once the page is prepared * * @vmf: structure describing the fault + * @folio: the folio of vmf->page * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. @@ -3269,7 +3270,7 @@ out: * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before * we acquired PTE lock. */ -static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) +static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, @@ -3285,7 +3286,7 @@ static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } @@ -3309,9 +3310,9 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; - return finish_mkwrite_fault(vmf); + return finish_mkwrite_fault(vmf, NULL); } - wp_page_reuse(vmf); + wp_page_reuse(vmf, NULL); return 0; } @@ -3339,14 +3340,14 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) folio_put(folio); return tmp; } - tmp = finish_mkwrite_fault(vmf); + tmp = finish_mkwrite_fault(vmf, folio); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { folio_unlock(folio); folio_put(folio); return tmp; } } else { - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); @@ -3491,7 +3492,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } - wp_page_reuse(vmf); + wp_page_reuse(vmf, folio); return 0; } /* -- cgit v1.2.3 From c2c3b5148052cef670d359b81d338d20b96bf47f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:05 +0800 Subject: mm: use folio_xchg_last_cpupid() in wp_page_reuse() Convert to use folio_xchg_last_cpupid() in wp_page_reuse(), and remove page variable. Since now only normal and PMD-mapped page is handled by numa balancing, it's enough to only update the entire folio's last cpupid. Link: https://lkml.kernel.org/r/20231018140806.2783514-19-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memory.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 94791c5e2951..1f18ed4a5497 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3019,19 +3019,20 @@ static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - struct page *page = vmf->page; pte_t entry; VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(folio && folio_test_anon(folio) && !PageAnonExclusive(page)); - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (page) - page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); + if (folio) { + VM_BUG_ON(folio_test_anon(folio) && + !PageAnonExclusive(vmf->page)); + /* + * Clear the folio's cpupid information as the existing + * information potentially belongs to a now completely + * unrelated process. + */ + folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1); + } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); -- cgit v1.2.3 From 8f0f4788b1247c2f92ecacd8f86ce0b379b807b9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 18 Oct 2023 22:08:06 +0800 Subject: mm: remove page_cpupid_xchg_last() Since all calls use folio_xchg_last_cpupid(), remove page_cpupid_xchg_last(). Link: https://lkml.kernel.org/r/20231018140806.2783514-20-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Ingo Molnar Cc: Juri Lelli Cc: Matthew Wilcox (Oracle) Cc: Peter Zijlstra Cc: Vincent Guittot Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 19 +++++++------------ mm/mmzone.c | 6 +++--- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d6e1d7656c2..0e496a8b8e50 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1696,9 +1696,9 @@ static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { - return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); + return xchg(&folio->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int folio_last_cpupid(struct folio *folio) @@ -1715,7 +1715,7 @@ static inline int folio_last_cpupid(struct folio *folio) return (folio->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } -extern int page_cpupid_xchg_last(struct page *page, int cpupid); +int folio_xchg_last_cpupid(struct folio *folio, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { @@ -1727,8 +1727,8 @@ static inline int folio_xchg_access_time(struct folio *folio, int time) { int last_time; - last_time = page_cpupid_xchg_last(&folio->page, - time >> PAGE_ACCESS_TIME_BUCKETS); + last_time = folio_xchg_last_cpupid(folio, + time >> PAGE_ACCESS_TIME_BUCKETS); return last_time << PAGE_ACCESS_TIME_BUCKETS; } @@ -1742,9 +1742,9 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } } #else /* !CONFIG_NUMA_BALANCING */ -static inline int page_cpupid_xchg_last(struct page *page, int cpupid) +static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { - return page_to_nid(page); /* XXX */ + return folio_nid(folio); /* XXX */ } static inline int folio_xchg_access_time(struct folio *folio, int time) @@ -1796,11 +1796,6 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) } #endif /* CONFIG_NUMA_BALANCING */ -static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid) -{ - return page_cpupid_xchg_last(&folio->page, cpupid); -} - #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) /* diff --git a/mm/mmzone.c b/mm/mmzone.c index 68e1511be12d..b594d3f268fe 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -93,19 +93,19 @@ void lruvec_init(struct lruvec *lruvec) } #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) -int page_cpupid_xchg_last(struct page *page, int cpupid) +int folio_xchg_last_cpupid(struct folio *folio, int cpupid) { unsigned long old_flags, flags; int last_cpupid; - old_flags = READ_ONCE(page->flags); + old_flags = READ_ONCE(folio->flags); do { flags = old_flags; last_cpupid = (flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; - } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } while (unlikely(!try_cmpxchg(&folio->flags, &old_flags, flags))); return last_cpupid; } -- cgit v1.2.3 From 6d4e2cda62af38f7c21052b6847ffff08ea7173f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:46 +0800 Subject: bootmem: use kmemleak_free_part_phys in put_page_bootmem Patch series "Some bugfix about kmemleak", v3. Some bugfixes for kmemleak and the printed info from debug mode. This patch (of 7): Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to delete kmemleak object in put_page_bootmem(). In debug mode, there are following warning: kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096) Link: https://lkml.kernel.org/r/20231018102952.3339837-1-liushixin2@huawei.com Link: https://lkml.kernel.org/r/20231018102952.3339837-2-liushixin2@huawei.com Fixes: dd0ff4d12dd2 ("bootmem: remove the vmemmap pages from kmemleak in put_page_bootmem") Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/bootmem_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index b1efebfcf94b..fa7cb0c87c03 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -34,7 +34,7 @@ void put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - kmemleak_free_part(page_to_virt(page), PAGE_SIZE); + kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } } -- cgit v1.2.3 From 80203f1ca086835100843f1474bd6dd4a48cc73b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:47 +0800 Subject: bootmem: use kmemleak_free_part_phys in free_bootmem_page Since kmemleak_alloc_phys() rather than kmemleak_alloc() was called from memblock_alloc_range_nid(), kmemleak_free_part_phys() should be used to delete kmemleak object in free_bootmem_page(). In debug mode, there are following warning: kmemleak: Partially freeing unknown object at 0xffff97345aff7000 (size 4096) Link: https://lkml.kernel.org/r/20231018102952.3339837-3-liushixin2@huawei.com Fixes: 028725e73375 ("bootmem: remove the vmemmap pages from kmemleak in free_bootmem_page") Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- include/linux/bootmem_info.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index e1a3c9c9754c..cffa38a73618 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -60,7 +60,7 @@ static inline void get_page_bootmem(unsigned long info, struct page *page, static inline void free_bootmem_page(struct page *page) { - kmemleak_free_part(page_to_virt(page), PAGE_SIZE); + kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } #endif -- cgit v1.2.3 From 62047e0f3e3a707599afe6d43cf684a2a02d05d5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:48 +0800 Subject: mm/kmemleak: fix print format of pointer in pr_debug() With 0x%p, the pointer will be hashed and print (____ptrval____) instead. And with 0x%pa, the pointer can be successfully printed but with duplicate prefixes, which looks like: kmemleak: kmemleak_free(0x(____ptrval____)) kmemleak: kmemleak_free_percpu(0x(____ptrval____)) kmemleak: kmemleak_free_part_phys(0x0x0000000a1af86000) Use 0x%px instead of 0x%p or 0x%pa to print the pointer. Then the print will be like: kmemleak: kmemleak_free(0xffff9111c145b020) kmemleak: kmemleak_free_percpu(0x00000000000333b0) kmemleak: kmemleak_free_part_phys(0x0000000a1af80000) Link: https://lkml.kernel.org/r/20231018102952.3339837-4-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 54c2c90d3abc..289b3be5ee6e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -975,7 +975,7 @@ static void object_no_scan(unsigned long ptr) void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) { - pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); + pr_debug("%s(0x%px, %zu, %d)\n", __func__, ptr, size, min_count); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); @@ -996,7 +996,7 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, { unsigned int cpu; - pr_debug("%s(0x%p, %zu)\n", __func__, ptr, size); + pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size); /* * Percpu allocations are only scanned and not reported as leaks @@ -1020,7 +1020,7 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); */ void __ref kmemleak_vmalloc(const struct vm_struct *area, size_t size, gfp_t gfp) { - pr_debug("%s(0x%p, %zu)\n", __func__, area, size); + pr_debug("%s(0x%px, %zu)\n", __func__, area, size); /* * A min_count = 2 is needed because vm_struct contains a reference to @@ -1043,7 +1043,7 @@ EXPORT_SYMBOL_GPL(kmemleak_vmalloc); */ void __ref kmemleak_free(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); @@ -1061,7 +1061,7 @@ EXPORT_SYMBOL_GPL(kmemleak_free); */ void __ref kmemleak_free_part(const void *ptr, size_t size) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size, false); @@ -1079,7 +1079,7 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) { unsigned int cpu; - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) @@ -1100,7 +1100,7 @@ void __ref kmemleak_update_trace(const void *ptr) struct kmemleak_object *object; unsigned long flags; - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (!kmemleak_enabled || IS_ERR_OR_NULL(ptr)) return; @@ -1131,7 +1131,7 @@ EXPORT_SYMBOL(kmemleak_update_trace); */ void __ref kmemleak_not_leak(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); @@ -1149,7 +1149,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); */ void __ref kmemleak_ignore(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr, false); @@ -1169,7 +1169,7 @@ EXPORT_SYMBOL(kmemleak_ignore); */ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); @@ -1187,7 +1187,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); */ void __ref kmemleak_no_scan(const void *ptr) { - pr_debug("%s(0x%p)\n", __func__, ptr); + pr_debug("%s(0x%px)\n", __func__, ptr); if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); @@ -1203,7 +1203,7 @@ EXPORT_SYMBOL(kmemleak_no_scan); */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp) { - pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size); + pr_debug("%s(0x%px, %zu)\n", __func__, &phys, size); if (kmemleak_enabled) /* @@ -1223,7 +1223,7 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { - pr_debug("%s(0x%pa)\n", __func__, &phys); + pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) delete_object_part((unsigned long)phys, size, true); @@ -1237,7 +1237,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { - pr_debug("%s(0x%pa)\n", __func__, &phys); + pr_debug("%s(0x%px)\n", __func__, &phys); if (kmemleak_enabled) make_black_object((unsigned long)phys, true); -- cgit v1.2.3 From 0edd7b58293340d26380d7510fd4fc08e5902511 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:49 +0800 Subject: mm: kmemleak: split __create_object into two functions __create_object() consists of two part, the first part allocate a kmemleak object and initialize it, the second part insert it into object tree. This function need kmemleak_lock but actually only the second part need lock. Split it into two functions, the first function __alloc_object only allocate a kmemleak object, and the second function __link_object() will initialize the object and insert it into object tree, use the kmemleak_lock to protect __link_object() only. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20231018102952.3339837-5-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 61 +++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 289b3be5ee6e..39732dcaf45b 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -623,25 +623,15 @@ static noinline depot_stack_handle_t set_track_prepare(void) return trace_handle; } -/* - * Create the metadata (struct kmemleak_object) corresponding to an allocated - * memory block and add it to the object_list and object_tree_root (or - * object_phys_tree_root). - */ -static void __create_object(unsigned long ptr, size_t size, - int min_count, gfp_t gfp, bool is_phys) +static struct kmemleak_object *__alloc_object(gfp_t gfp) { - unsigned long flags; - struct kmemleak_object *object, *parent; - struct rb_node **link, *rb_parent; - unsigned long untagged_ptr; - unsigned long untagged_objp; + struct kmemleak_object *object; object = mem_pool_alloc(gfp); if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return; + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -649,13 +639,8 @@ static void __create_object(unsigned long ptr, size_t size, INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); - object->pointer = ptr; - object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; - object->min_count = min_count; object->count = 0; /* white color initially */ - object->jiffies = jiffies; object->checksum = 0; object->del_state = 0; @@ -680,7 +665,23 @@ static void __create_object(unsigned long ptr, size_t size, /* kernel backtrace */ object->trace_handle = set_track_prepare(); - raw_spin_lock_irqsave(&kmemleak_lock, flags); + return object; +} + +static void __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) +{ + + struct kmemleak_object *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + + object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; + object->min_count = min_count; + object->jiffies = jiffies; untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* @@ -711,14 +712,32 @@ static void __create_object(unsigned long ptr, size_t size, */ dump_object_info(parent); kmem_cache_free(object_cache, object); - goto out; + return; } } rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); list_add_tail_rcu(&object->object_list, &object_list); -out: +} + +/* + * Create the metadata (struct kmemleak_object) corresponding to an allocated + * memory block and add it to the object_list and object_tree_root (or + * object_phys_tree_root). + */ +static void __create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp, bool is_phys) +{ + struct kmemleak_object *object; + unsigned long flags; + + object = __alloc_object(gfp); + if (!object) + return; + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + __link_object(object, ptr, size, min_count, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); } -- cgit v1.2.3 From 2e1d47385f98aa0fa7d71aeee32d26cc6f8493e0 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:50 +0800 Subject: mm: kmemleak: use mem_pool_free() to free object The kmemleak object is allocated by mem_pool_alloc(), which could be from slab or mem_pool[], so it's not suitable using __kmem_cache_free() to free the object, use __mem_pool_free() instead. Link: https://lkml.kernel.org/r/20231018102952.3339837-6-liushixin2@huawei.com Fixes: 0647398a8c7b ("mm: kmemleak: simple memory allocation pool for kmemleak objects") Signed-off-by: Liu Shixin Reviewed-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 39732dcaf45b..82322b029f1c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -668,8 +668,8 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) return object; } -static void __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) +static int __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) { struct kmemleak_object *parent; @@ -711,14 +711,15 @@ static void __link_object(struct kmemleak_object *object, unsigned long ptr, * be freed while the kmemleak_lock is held. */ dump_object_info(parent); - kmem_cache_free(object_cache, object); - return; + return -EEXIST; } } rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); list_add_tail_rcu(&object->object_list, &object_list); + + return 0; } /* @@ -731,14 +732,17 @@ static void __create_object(unsigned long ptr, size_t size, { struct kmemleak_object *object; unsigned long flags; + int ret; object = __alloc_object(gfp); if (!object) return; raw_spin_lock_irqsave(&kmemleak_lock, flags); - __link_object(object, ptr, size, min_count, is_phys); + ret = __link_object(object, ptr, size, min_count, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + if (ret) + mem_pool_free(object); } /* Create kmemleak object which allocated with virtual address. */ -- cgit v1.2.3 From 858a195b9330d0bd36f59e8652f693a1beb7da2f Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:51 +0800 Subject: mm: kmemleak: add __find_and_remove_object() Add new __find_and_remove_object() without kmemleak_lock protect, it is in preparation for the next patch. Link: https://lkml.kernel.org/r/20231018102952.3339837-7-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 82322b029f1c..f63c220eb49e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -583,6 +583,19 @@ static void __remove_object(struct kmemleak_object *object) object->del_state |= DELSTATE_REMOVED; } +static struct kmemleak_object *__find_and_remove_object(unsigned long ptr, + int alias, + bool is_phys) +{ + struct kmemleak_object *object; + + object = __lookup_object(ptr, alias, is_phys); + if (object) + __remove_object(object); + + return object; +} + /* * Look up an object in the object search tree and remove it from both * object_tree_root (or object_phys_tree_root) and object_list. The @@ -596,9 +609,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali struct kmemleak_object *object; raw_spin_lock_irqsave(&kmemleak_lock, flags); - object = __lookup_object(ptr, alias, is_phys); - if (object) - __remove_object(object); + object = __find_and_remove_object(ptr, alias, is_phys); raw_spin_unlock_irqrestore(&kmemleak_lock, flags); return object; -- cgit v1.2.3 From 5e4fc577db2514fee3cc2729415d0e2589d5d056 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Wed, 18 Oct 2023 18:29:52 +0800 Subject: mm/kmemleak: fix partially freeing unknown object warning delete_object_part() can be called by multiple callers in the same time. If an object is found and removed by a caller, and then another caller try to find it too, it failed and return directly. It still be recorded by kmemleak even if it has already been freed to buddy. With DEBUG on, kmemleak will report the following warning, kmemleak: Partially freeing unknown object at 0xa1af86000 (size 4096) CPU: 0 PID: 742 Comm: test_huge Not tainted 6.6.0-rc3kmemleak+ #54 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x37/0x50 kmemleak_free_part_phys+0x50/0x60 hugetlb_vmemmap_optimize+0x172/0x290 ? __pfx_vmemmap_remap_pte+0x10/0x10 __prep_new_hugetlb_folio+0xe/0x30 prep_new_hugetlb_folio.isra.0+0xe/0x40 alloc_fresh_hugetlb_folio+0xc3/0xd0 alloc_surplus_hugetlb_folio.constprop.0+0x6e/0xd0 hugetlb_acct_memory.part.0+0xe6/0x2a0 hugetlb_reserve_pages+0x110/0x2c0 hugetlbfs_file_mmap+0x11d/0x1b0 mmap_region+0x248/0x9a0 ? hugetlb_get_unmapped_area+0x15c/0x2d0 do_mmap+0x38b/0x580 vm_mmap_pgoff+0xe6/0x190 ksys_mmap_pgoff+0x18a/0x1f0 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Expand __create_object() and move __alloc_object() to the beginning. Then use kmemleak_lock to protect __find_and_remove_object() and __link_object() as a whole, which can guarantee all objects are processed sequentialally. Link: https://lkml.kernel.org/r/20231018102952.3339837-8-liushixin2@huawei.com Fixes: 53238a60dd4a ("kmemleak: Allow partial freeing of memory blocks") Signed-off-by: Liu Shixin Reviewed-by: Catalin Marinas Cc: Kefeng Wang Cc: Patrick Wang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f63c220eb49e..22bab3738a9e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -816,16 +816,25 @@ static void delete_object_full(unsigned long ptr) */ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) { - struct kmemleak_object *object; - unsigned long start, end; + struct kmemleak_object *object, *object_l, *object_r; + unsigned long start, end, flags; + + object_l = __alloc_object(GFP_KERNEL); + if (!object_l) + return; - object = find_and_remove_object(ptr, 1, is_phys); + object_r = __alloc_object(GFP_KERNEL); + if (!object_r) + goto out; + + raw_spin_lock_irqsave(&kmemleak_lock, flags); + object = __find_and_remove_object(ptr, 1, is_phys); if (!object) { #ifdef DEBUG kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", ptr, size); #endif - return; + goto unlock; } /* @@ -835,14 +844,25 @@ static void delete_object_part(unsigned long ptr, size_t size, bool is_phys) */ start = object->pointer; end = object->pointer + object->size; - if (ptr > start) - __create_object(start, ptr - start, object->min_count, - GFP_KERNEL, is_phys); - if (ptr + size < end) - __create_object(ptr + size, end - ptr - size, object->min_count, - GFP_KERNEL, is_phys); + if ((ptr > start) && + !__link_object(object_l, start, ptr - start, + object->min_count, is_phys)) + object_l = NULL; + if ((ptr + size < end) && + !__link_object(object_r, ptr + size, end - ptr - size, + object->min_count, is_phys)) + object_r = NULL; + +unlock: + raw_spin_unlock_irqrestore(&kmemleak_lock, flags); + if (object) + __delete_object(object); - __delete_object(object); +out: + if (object_l) + mem_pool_free(object_l); + if (object_r) + mem_pool_free(object_r); } static void __paint_it(struct kmemleak_object *object, int color) -- cgit v1.2.3 From 245245c2fffd0050772a3f30ba50e2be92537a32 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Mon, 23 Oct 2023 10:51:25 +0800 Subject: mm/kmemleak: move the initialisation of object to __link_object In patch (mm: kmemleak: split __create_object into two functions), the initialisation of object has been splited in two places. Catalin said it feels a bit weird and error prone. So leave __alloc_object() to just do the actual allocation and let __link_object() do the full initialisation. Link: https://lkml.kernel.org/r/20231023025125.90972-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Suggested-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 22bab3738a9e..1eacca03bedd 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -642,16 +642,32 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) if (!object) { pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); - return NULL; } + return object; +} + +static int __link_object(struct kmemleak_object *object, unsigned long ptr, + size_t size, int min_count, bool is_phys) +{ + + struct kmemleak_object *parent; + struct rb_node **link, *rb_parent; + unsigned long untagged_ptr; + unsigned long untagged_objp; + INIT_LIST_HEAD(&object->object_list); INIT_LIST_HEAD(&object->gray_list); INIT_HLIST_HEAD(&object->area_list); raw_spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); + object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); + object->pointer = ptr; + object->size = kfence_ksize((void *)ptr) ?: size; object->excess_ref = 0; + object->min_count = min_count; object->count = 0; /* white color initially */ + object->jiffies = jiffies; object->checksum = 0; object->del_state = 0; @@ -676,24 +692,6 @@ static struct kmemleak_object *__alloc_object(gfp_t gfp) /* kernel backtrace */ object->trace_handle = set_track_prepare(); - return object; -} - -static int __link_object(struct kmemleak_object *object, unsigned long ptr, - size_t size, int min_count, bool is_phys) -{ - - struct kmemleak_object *parent; - struct rb_node **link, *rb_parent; - unsigned long untagged_ptr; - unsigned long untagged_objp; - - object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0); - object->pointer = ptr; - object->size = kfence_ksize((void *)ptr) ?: size; - object->min_count = min_count; - object->jiffies = jiffies; - untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); /* * Only update min_addr and max_addr with object -- cgit v1.2.3 From a259945efe6ada94087ef666e9b38f8e34ea34ba Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 17 Oct 2023 12:31:28 -0400 Subject: mm/migrate: correct nr_failed in migrate_pages_sync() nr_failed was missing the large folio splits from migrate_pages_batch() and can cause a mismatch between migrate_pages() return value and the number of not migrated pages, i.e., when the return value of migrate_pages() is 0, there are still pages left in the from page list. It will happen when a non-PMD THP large folio fails to migrate due to -ENOMEM and is split successfully but not all the split pages are not migrated, migrate_pages_batch() would return non-zero, but astats.nr_thp_split = 0. nr_failed would be 0 and returned to the caller of migrate_pages(), but the not migrated pages are left in the from page list without being added back to LRU lists. Fix it by adding a new nr_split counter for large folio splits and adding it to nr_failed in migrate_page_sync() after migrate_pages_batch() is done. Link: https://lkml.kernel.org/r/20231017163129.2025214-1-zi.yan@sent.com Fixes: 2ef7dbb26990 ("migrate_pages: try migrate in batch asynchronously firstly") Signed-off-by: Zi Yan Acked-by: Huang Ying Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/migrate.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 71fc5064cec6..53244d283c3e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1495,6 +1495,7 @@ struct migrate_pages_stats { int nr_thp_succeeded; /* THP migrated successfully */ int nr_thp_failed; /* THP failed to be migrated */ int nr_thp_split; /* THP split before migrating */ + int nr_split; /* Large folio (include THP) split before migrating */ }; /* @@ -1614,6 +1615,7 @@ static int migrate_pages_batch(struct list_head *from, int nr_retry_pages = 0; int pass = 0; bool is_thp = false; + bool is_large = false; struct folio *folio, *folio2, *dst = NULL, *dst2; int rc, rc_saved = 0, nr_pages; LIST_HEAD(unmap_folios); @@ -1629,7 +1631,8 @@ static int migrate_pages_batch(struct list_head *from, nr_retry_pages = 0; list_for_each_entry_safe(folio, folio2, from, lru) { - is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio); + is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); @@ -1649,6 +1652,7 @@ static int migrate_pages_batch(struct list_head *from, stats->nr_thp_failed++; if (!try_split_folio(folio, split_folios)) { stats->nr_thp_split++; + stats->nr_split++; continue; } stats->nr_failed_pages += nr_pages; @@ -1677,11 +1681,12 @@ static int migrate_pages_batch(struct list_head *from, nr_failed++; stats->nr_thp_failed += is_thp; /* Large folio NUMA faulting doesn't split to retry. */ - if (folio_test_large(folio) && !nosplit) { + if (is_large && !nosplit) { int ret = try_split_folio(folio, split_folios); if (!ret) { stats->nr_thp_split += is_thp; + stats->nr_split += is_large; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1827,6 +1832,7 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, stats->nr_succeeded += astats.nr_succeeded; stats->nr_thp_succeeded += astats.nr_thp_succeeded; stats->nr_thp_split += astats.nr_thp_split; + stats->nr_split += astats.nr_split; if (rc < 0) { stats->nr_failed_pages += astats.nr_failed_pages; stats->nr_thp_failed += astats.nr_thp_failed; @@ -1834,7 +1840,11 @@ static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio, return rc; } stats->nr_thp_failed += astats.nr_thp_split; - nr_failed += astats.nr_thp_split; + /* + * Do not count rc, as pages will be retried below. + * Count nr_split only, since it includes nr_thp_split. + */ + nr_failed += astats.nr_split; /* * Fall back to migrate all failed folios one by one synchronously. All * failed folios except split THPs will be retried, so their failure -- cgit v1.2.3 From 49cac03a8f0a56cafa5329911564c97c130ced43 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 17 Oct 2023 12:31:29 -0400 Subject: mm/migrate: add nr_split to trace_mm_migrate_pages stats. Add nr_split to trace_mm_migrate_pages for large folio (including THP) split events. [akpm@linux-foundation.org: cleanup per Huang, Ying] Link: https://lkml.kernel.org/r/20231017163129.2025214-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: "Huang, Ying" Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Huang Ying Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/trace/events/migrate.h | 24 ++++++++++++++---------- mm/migrate.c | 5 +++-- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 061b5128f335..0190ef725b43 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -49,10 +49,11 @@ TRACE_EVENT(mm_migrate_pages, TP_PROTO(unsigned long succeeded, unsigned long failed, unsigned long thp_succeeded, unsigned long thp_failed, - unsigned long thp_split, enum migrate_mode mode, int reason), + unsigned long thp_split, unsigned long large_folio_split, + enum migrate_mode mode, int reason), TP_ARGS(succeeded, failed, thp_succeeded, thp_failed, - thp_split, mode, reason), + thp_split, large_folio_split, mode, reason), TP_STRUCT__entry( __field( unsigned long, succeeded) @@ -60,26 +61,29 @@ TRACE_EVENT(mm_migrate_pages, __field( unsigned long, thp_succeeded) __field( unsigned long, thp_failed) __field( unsigned long, thp_split) + __field( unsigned long, large_folio_split) __field( enum migrate_mode, mode) __field( int, reason) ), TP_fast_assign( - __entry->succeeded = succeeded; - __entry->failed = failed; - __entry->thp_succeeded = thp_succeeded; - __entry->thp_failed = thp_failed; - __entry->thp_split = thp_split; - __entry->mode = mode; - __entry->reason = reason; + __entry->succeeded = succeeded; + __entry->failed = failed; + __entry->thp_succeeded = thp_succeeded; + __entry->thp_failed = thp_failed; + __entry->thp_split = thp_split; + __entry->large_folio_split = large_folio_split; + __entry->mode = mode; + __entry->reason = reason; ), - TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu mode=%s reason=%s", + TP_printk("nr_succeeded=%lu nr_failed=%lu nr_thp_succeeded=%lu nr_thp_failed=%lu nr_thp_split=%lu nr_split=%lu mode=%s reason=%s", __entry->succeeded, __entry->failed, __entry->thp_succeeded, __entry->thp_failed, __entry->thp_split, + __entry->large_folio_split, __print_symbolic(__entry->mode, MIGRATE_MODE), __print_symbolic(__entry->reason, MIGRATE_REASON)) ); diff --git a/mm/migrate.c b/mm/migrate.c index 53244d283c3e..125194f5af0f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1686,7 +1686,7 @@ static int migrate_pages_batch(struct list_head *from, if (!ret) { stats->nr_thp_split += is_thp; - stats->nr_split += is_large; + stats->nr_split++; break; } else if (reason == MR_LONGTERM_PIN && ret == -EAGAIN) { @@ -1979,7 +1979,8 @@ out: count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_succeeded, stats.nr_thp_failed, - stats.nr_thp_split, mode, reason); + stats.nr_thp_split, stats.nr_split, mode, + reason); if (ret_succeeded) *ret_succeeded = stats.nr_succeeded; -- cgit v1.2.3 From c2baef394af88af19e3945f861145679e92ff3e4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 Oct 2023 18:43:54 +0800 Subject: mm: page_alloc: skip memoryless nodes entirely Patch series "handle memoryless nodes more appropriately", v3. Currently, in the process of initialization or offline memory, memoryless nodes will still be built into the fallback list of itself or other nodes. This is not what we expected, so this patch series removes memoryless nodes from the fallback list entirely. This patch (of 2): In find_next_best_node(), we skipped the memoryless nodes when building the zonelists of other normal nodes (N_NORMAL), but did not skip the memoryless node itself when building the zonelist. This will cause it to be traversed at runtime. For example, say we have node0 and node1, node0 is memoryless node, then the fallback order of node0 and node1 as follows: [ 0.153005] Fallback order for Node 0: 0 1 [ 0.153564] Fallback order for Node 1: 1 After this patch, we skip memoryless node0 entirely, then the fallback order of node0 and node1 as follows: [ 0.155236] Fallback order for Node 0: 1 [ 0.155806] Fallback order for Node 1: 1 So it becomes completely invisible, which will reduce runtime overhead. And in this way, we will not try to allocate pages from memoryless node0, then the panic mentioned in [1] will also be fixed. Even though this problem has been solved by dropping the NODE_MIN_SIZE constrain in x86 [2], it would be better to fix it in core MM as well. [1]. https://lore.kernel.org/all/20230212110305.93670-1-zhengqi.arch@bytedance.com/ [2]. https://lore.kernel.org/all/20231017062215.171670-1-rppt@kernel.org/ [zhengqi.arch@bytedance.com: update comment, per Ingo] Link: https://lkml.kernel.org/r/7300fc00a057eefeb9a68c8ad28171c3f0ce66ce.1697799303.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cover.1697799303.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/cover.1697711415.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/157013e978468241de4a4c05d5337a44638ecb0e.1697711415.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Acked-by: Ingo Molnar Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d52718284029..aa90758950a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5020,8 +5020,11 @@ int find_next_best_node(int node, nodemask_t *used_node_mask) int min_val = INT_MAX; int best_node = NUMA_NO_NODE; - /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { + /* + * Use the local node if we haven't already, but for memoryless local + * node, we should skip it and fall back to other nodes. + */ + if (!node_isset(node, *used_node_mask) && node_state(node, N_MEMORY)) { node_set(node, *used_node_mask); return node; } -- cgit v1.2.3 From b7812c86c7403283f8b3775ee11c6c01d457016c Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 19 Oct 2023 18:43:55 +0800 Subject: mm: memory_hotplug: drop memoryless node from fallback lists In offline_pages(), if a node becomes memoryless, we will clear its N_MEMORY state by calling node_states_clear_node(). But we do this after rebuilding the zonelists by calling build_all_zonelists(), which will cause this memoryless node to still be in the fallback nodes (node_order[]) of other nodes. To drop memoryless nodes from fallback nodes in this case, just call node_states_clear_node() before calling build_all_zonelists(). In this way, we will not try to allocate pages from memoryless node0, then the panic mentioned in [1] will also be fixed. Even though this problem has been solved by dropping the NODE_MIN_SIZE constrain in x86 [2], it would be better to fix it in the core MM as well. https://lore.kernel.org/all/20230212110305.93670-1-zhengqi.arch@bytedance.com/ [1] https://lore.kernel.org/all/20231017062215.171670-1-rppt@kernel.org/ [2] Link: https://lkml.kernel.org/r/9f1dbe7ee1301c7163b2770e32954ff5e3ecf2c4.1697711415.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Acked-by: Ingo Molnar Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3b301c4023ff..ab41a511e20a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -2012,12 +2012,16 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, /* reinitialise watermarks and update pcp limits */ init_per_zone_wmark_min(); + /* + * Make sure to mark the node as memory-less before rebuilding the zone + * list. Otherwise this node would still appear in the fallback lists. + */ + node_states_clear_node(node, &arg); if (!populated_zone(zone)) { zone_pcp_reset(zone); build_all_zonelists(NULL); } - node_states_clear_node(node, &arg); if (arg.status_change_nid >= 0) { kcompactd_stop(node); kswapd_stop(node); -- cgit v1.2.3 From 8dd1e896735f6e5abf66525dfd39bbd7b8c0c6d6 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:27 -0700 Subject: mm/khugepaged: convert __collapse_huge_page_isolate() to use folios Patch series "Some khugepaged folio conversions", v3. This patchset converts a number of functions to use folios. This cleans up some khugepaged code and removes a large number of hidden compound_head() calls. This patch (of 5): Replaces 11 calls to compound_head() with 1, and removes 1348 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20231020183331.10770-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/khugepaged.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 88433cc25d8a..500756604488 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -542,6 +542,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, struct list_head *compound_pagelist) { struct page *page = NULL; + struct folio *folio = NULL; pte_t *_pte; int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; @@ -576,7 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE(!PageAnon(page), page); + folio = page_folio(page); + VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio); if (page_mapcount(page) > 1) { ++shared; @@ -588,16 +590,15 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, } } - if (PageCompound(page)) { - struct page *p; - page = compound_head(page); + if (folio_test_large(folio)) { + struct folio *f; /* * Check if we have dealt with the compound page * already */ - list_for_each_entry(p, compound_pagelist, lru) { - if (page == p) + list_for_each_entry(f, compound_pagelist, lru) { + if (folio == f) goto next; } } @@ -608,7 +609,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { + if (!folio_trylock(folio)) { result = SCAN_PAGE_LOCK; goto out; } @@ -624,8 +625,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(page)) { - unlock_page(page); + if (!is_refcount_suitable(&folio->page)) { + folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; } @@ -634,27 +635,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ - if (!isolate_lru_page(page)) { - unlock_page(page); + if (!folio_isolate_lru(folio)) { + folio_unlock(folio); result = SCAN_DEL_PAGE_LRU; goto out; } - mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_lru(page), - compound_nr(page)); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageLRU(page), page); + node_stat_mod_folio(folio, + NR_ISOLATED_ANON + folio_is_file_lru(folio), + folio_nr_pages(folio)); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - if (PageCompound(page)) - list_add_tail(&page->lru, compound_pagelist); + if (folio_test_large(folio)) + list_add_tail(&folio->lru, compound_pagelist); next: /* * If collapse was initiated by khugepaged, check that there is * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && - (pte_young(pteval) || page_is_young(page) || - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; @@ -668,13 +669,13 @@ next: result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page, none_or_zero, + trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } out: release_pte_pages(pte, _pte, compound_pagelist); - trace_mm_collapse_huge_page_isolate(page, none_or_zero, + trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero, referenced, writable, result); return result; } -- cgit v1.2.3 From 5c07ebb372d66423e508ecfb8e00324f8797f072 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:28 -0700 Subject: mm/khugepaged: convert hpage_collapse_scan_pmd() to use folios Replaces 5 calls to compound_head(), and removes 1385 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 500756604488..6c4b5af43371 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1248,6 +1248,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; + struct folio *folio = NULL; unsigned long _address; spinlock_t *ptl; int node = NUMA_NO_NODE, unmapped = 0; @@ -1334,29 +1335,28 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, } } - page = compound_head(page); - + folio = page_folio(page); /* * Record which node the original page is from and save this * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ - node = page_to_nid(page); + node = folio_nid(folio); if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } cc->node_load[node]++; - if (!PageLRU(page)) { + if (!folio_test_lru(folio)) { result = SCAN_PAGE_LRU; goto out_unmap; } - if (PageLocked(page)) { + if (folio_test_locked(folio)) { result = SCAN_PAGE_LOCK; goto out_unmap; } - if (!PageAnon(page)) { + if (!folio_test_anon(folio)) { result = SCAN_PAGE_ANON; goto out_unmap; } @@ -1371,7 +1371,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(page)) { + if (!is_refcount_suitable(&folio->page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } @@ -1381,8 +1381,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * enough young pte to justify collapsing the page */ if (cc->is_khugepaged && - (pte_young(pteval) || page_is_young(page) || - PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + (pte_young(pteval) || folio_test_young(folio) || + folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm, address))) referenced++; } @@ -1404,7 +1404,7 @@ out_unmap: *mmap_locked = false; } out: - trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced, none_or_zero, result, unmapped); return result; } -- cgit v1.2.3 From dbf85c21e4aff90912b5d7755d2b25611f9191e9 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:29 -0700 Subject: mm/khugepaged: convert is_refcount_suitable() to use folios Both callers of is_refcount_suitable() have been converted to use folios, so convert it to take in a folio. Both callers only operate on head pages of folios so mapcount/refcount conversions here are trivial. Removes 3 calls to compound head, and removes 315 bytes of kernel text. Link: https://lkml.kernel.org/r/20231020183331.10770-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6c4b5af43371..9efd8ff68f06 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -524,15 +524,15 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte, } } -static bool is_refcount_suitable(struct page *page) +static bool is_refcount_suitable(struct folio *folio) { int expected_refcount; - expected_refcount = total_mapcount(page); - if (PageSwapCache(page)) - expected_refcount += compound_nr(page); + expected_refcount = folio_mapcount(folio); + if (folio_test_swapcache(folio)) + expected_refcount += folio_nr_pages(folio); - return page_count(page) == expected_refcount; + return folio_ref_count(folio) == expected_refcount; } static int __collapse_huge_page_isolate(struct vm_area_struct *vma, @@ -625,7 +625,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * but not from this process. The other process cannot write to * the page, only trigger CoW. */ - if (!is_refcount_suitable(&folio->page)) { + if (!is_refcount_suitable(folio)) { folio_unlock(folio); result = SCAN_PAGE_COUNT; goto out; @@ -1371,7 +1371,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, * has excessive GUP pins (i.e. 512). Anyway the same check * will be done again later the risk seems low. */ - if (!is_refcount_suitable(&folio->page)) { + if (!is_refcount_suitable(folio)) { result = SCAN_PAGE_COUNT; goto out_unmap; } -- cgit v1.2.3 From b455f39d228935f88eebcd1f7c1a6981093c6a3b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:30 -0700 Subject: mm/khugepaged: convert alloc_charge_hpage() to use folios Also remove count_memcg_page_event now that its last caller no longer uses it and reword hpage_collapse_alloc_page() to hpage_collapse_alloc_folio(). This removes 1 call to compound_head() and helps convert khugepaged to use folios throughout. Link: https://lkml.kernel.org/r/20231020183331.10770-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 -------------- mm/khugepaged.c | 17 ++++++++++------- 2 files changed, 10 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b3d2e3e60eed..7bdcf3020d7a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1087,15 +1087,6 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, local_irq_restore(flags); } -static inline void count_memcg_page_event(struct page *page, - enum vm_event_item idx) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (memcg) - count_memcg_events(memcg, idx, 1); -} - static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { @@ -1598,11 +1589,6 @@ static inline void __count_memcg_events(struct mem_cgroup *memcg, { } -static inline void count_memcg_page_event(struct page *page, - int idx) -{ -} - static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9efd8ff68f06..6a7184cd291b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -888,16 +888,16 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) } #endif -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, +static bool hpage_collapse_alloc_folio(struct folio **folio, gfp_t gfp, int node, nodemask_t *nmask) { - *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); - if (unlikely(!*hpage)) { + *folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, nmask); + + if (unlikely(!*folio)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return false; } - folio_prep_large_rmappable((struct folio *)*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return true; } @@ -1064,17 +1064,20 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, int node = hpage_collapse_find_target_node(cc); struct folio *folio; - if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) + if (!hpage_collapse_alloc_folio(&folio, gfp, node, &cc->alloc_nmask)) { + *hpage = NULL; return SCAN_ALLOC_HUGE_PAGE_FAIL; + } - folio = page_folio(*hpage); if (unlikely(mem_cgroup_charge(folio, mm, gfp))) { folio_put(folio); *hpage = NULL; return SCAN_CGROUP_CHARGE_FAIL; } - count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1); + + *hpage = folio_page(folio, 0); return SCAN_SUCCEED; } -- cgit v1.2.3 From 98b32d296d95d7aa0516c36b72406277412268cd Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Fri, 20 Oct 2023 11:33:31 -0700 Subject: mm/khugepaged: convert collapse_pte_mapped_thp() to use folios This removes 2 calls to compound_head() and helps convert khugepaged to use folios throughout. Previously, if the address passed to collapse_pte_mapped_thp() corresponded to a tail page, the scan would fail immediately. Using filemap_lock_folio() we get the corresponding folio back and try to operate on the folio instead. Link: https://lkml.kernel.org/r/20231020183331.10770-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Rik van Riel Reviewed-by: Yang Shi Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6a7184cd291b..bc2d8ff269c7 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1477,7 +1477,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); - struct page *hpage; + struct folio *folio; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; spinlock_t *pml = NULL, *ptl; @@ -1510,19 +1510,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; - hpage = find_lock_page(vma->vm_file->f_mapping, + folio = filemap_lock_folio(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); - if (!hpage) + if (IS_ERR(folio)) return SCAN_PAGE_NULL; - if (!PageHead(hpage)) { - result = SCAN_FAIL; - goto drop_hpage; - } - - if (compound_order(hpage) != HPAGE_PMD_ORDER) { + if (folio_order(folio) != HPAGE_PMD_ORDER) { result = SCAN_PAGE_COMPOUND; - goto drop_hpage; + goto drop_folio; } result = find_pmd_or_thp_or_none(mm, haddr, &pmd); @@ -1536,13 +1531,13 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, */ goto maybe_install_pmd; default: - goto drop_hpage; + goto drop_folio; } result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ - goto drop_hpage; + goto drop_folio; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1567,7 +1562,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * Note that uprobe, debugger, or MAP_PRIVATE may change the * page table, but the new page will not be a subpage of hpage. */ - if (hpage + i != page) + if (folio_page(folio, i) != page) goto abort; } @@ -1582,7 +1577,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, * page_table_lock) ptl nests inside pml. The less time we hold pml, * the better; but userfaultfd's mfill_atomic_pte() on a private VMA * inserts a valid as-if-COWed PTE without even looking up page cache. - * So page lock of hpage does not protect from it, so we must not drop + * So page lock of folio does not protect from it, so we must not drop * ptl before pgt_pmd is removed, so uffd private needs pml taken now. */ if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) @@ -1606,7 +1601,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, continue; /* * We dropped ptl after the first scan, to do the mmu_notifier: - * page lock stops more PTEs of the hpage being faulted in, but + * page lock stops more PTEs of the folio being faulted in, but * does not stop write faults COWing anon copies from existing * PTEs; and does not stop those being swapped out or migrated. */ @@ -1615,7 +1610,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto abort; } page = vm_normal_page(vma, addr, ptent); - if (hpage + i != page) + if (folio_page(folio, i) != page) goto abort; /* @@ -1634,8 +1629,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { - page_ref_sub(hpage, nr_ptes); - add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + folio_ref_sub(folio, nr_ptes); + add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } /* step 4: remove empty page table */ @@ -1659,14 +1654,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd - ? set_huge_pmd(vma, haddr, pmd, hpage) + ? set_huge_pmd(vma, haddr, pmd, &folio->page) : SCAN_SUCCEED; - goto drop_hpage; + goto drop_folio; abort: if (nr_ptes) { flush_tlb_mm(mm); - page_ref_sub(hpage, nr_ptes); - add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + folio_ref_sub(folio, nr_ptes); + add_mm_counter(mm, mm_counter_file(&folio->page), -nr_ptes); } if (start_pte) pte_unmap_unlock(start_pte, ptl); @@ -1674,9 +1669,9 @@ abort: spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); -drop_hpage: - unlock_page(hpage); - put_page(hpage); +drop_folio: + folio_unlock(folio); + folio_put(folio); return result; } -- cgit v1.2.3 From be16dd764a69752a31096d1a6b2ad775b728b1bd Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Mon, 23 Oct 2023 17:44:05 +0500 Subject: mm: fix multiple typos in multiple files Link: https://lkml.kernel.org/r/20231023124405.36981-1-m.muzzammilashraf@gmail.com Signed-off-by: Muhammad Muzammil Reviewed-by: Randy Dunlap Cc: "James E.J. Bottomley" Cc: Matthew Wilcox (Oracle) Cc: Muhammad Muzammil Signed-off-by: Andrew Morton --- mm/debug_vm_pgtable.c | 4 ++-- mm/internal.h | 2 +- mm/memcontrol.c | 4 ++-- mm/mmap.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 48e329ea5ba3..e651500e597a 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -1322,8 +1322,8 @@ static int __init debug_vm_pgtable(void) * true irrespective of the starting protection value for a * given page table entry. * - * Protection based vm_flags combinatins are always linear - * and increasing i.e starting from VM_NONE and going upto + * Protection based vm_flags combinations are always linear + * and increasing i.e starting from VM_NONE and going up to * (VM_SHARED | READ | WRITE | EXEC). */ #define VM_FLAGS_START (VM_NONE) diff --git a/mm/internal.h b/mm/internal.h index c61a98d3b3c7..3eceae1ec4c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -592,7 +592,7 @@ extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags, * range. * "fully mapped" means all the pages of folio is associated with the page * table of range while this function just check whether the folio range is - * within the range [start, end). Funcation caller nees to do page table + * within the range [start, end). Function caller needs to do page table * check if it cares about the page table association. * * Typical usage (like mlock or madvise) is: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b0859b8cc03..774bd6e21e27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -819,7 +819,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, memcg = pn->memcg; /* - * The caller from rmap relay on disabled preemption becase they never + * The caller from rmap relies on disabled preemption because they never * update their counter from in-interrupt context. For these two * counters we check that the update is never performed from an * interrupt context while other caller need to have disabled interrupt. @@ -8044,7 +8044,7 @@ static struct cftype memsw_files[] = { * * This doesn't check for specific headroom, and it is not atomic * either. But with zswap, the size of the allocation is only known - * once compression has occured, and this optimistic pre-check avoids + * once compression has occurred, and this optimistic pre-check avoids * spending cycles on compression when there is already no room left * or zswap is disabled altogether somewhere in the hierarchy. */ diff --git a/mm/mmap.c b/mm/mmap.c index 8b57e42fd980..984804d77ae1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1223,7 +1223,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * Does the application expect PROT_READ to imply PROT_EXEC? * * (the exception is when the underlying filesystem is noexec - * mounted, in which case we dont add PROT_EXEC.) + * mounted, in which case we don't add PROT_EXEC.) */ if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) if (!(file && path_noexec(&file->f_path))) -- cgit v1.2.3 From 76f26535d1446373d4735a252ea4247c39d64ba6 Mon Sep 17 00:00:00 2001 From: Hyesoo Yu Date: Mon, 23 Oct 2023 17:32:16 +0900 Subject: mm: page_alloc: check the order of compound page even when the order is zero For compound pages, the head sets the PG_head flag and the tail sets the compound_head to indicate the head page. If a user allocates a compound page and frees it with a different order, the compound page information will not be properly initialized. To detect this problem, compound_order(page) and the order argument are compared, but this is not checked when the order argument is zero. That error should be checked regardless of the order. Link: https://lkml.kernel.org/r/20231023083217.1866451-1-hyesoo.yu@samsung.com Signed-off-by: Hyesoo Yu Reviewed-by: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aa90758950a7..5f46c7f85cb1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1079,6 +1079,7 @@ static __always_inline bool free_pages_prepare(struct page *page, int bad = 0; bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); bool init = want_init_on_free(); + bool compound = PageCompound(page); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1097,16 +1098,15 @@ static __always_inline bool free_pages_prepare(struct page *page, return false; } + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + /* * Check tail pages before head page information is cleared to * avoid checking PageCompound for order-0 pages. */ if (unlikely(order)) { - bool compound = PageCompound(page); int i; - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - if (compound) page[1].flags &= ~PAGE_FLAGS_SECOND; for (i = 1; i < (1 << order); i++) { -- cgit v1.2.3 From e5b16c862884a62c09ab60cbfc6c7b544670bf9e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 22 Oct 2023 20:56:19 +0200 Subject: mm: hugetlb_vmemmap: fix reference to nonexistent file The directory this file is in was renamed but the reference didn't get updated. Fix it. Link: https://lkml.kernel.org/r/20231022185619.919397-1-vegard.nossum@oracle.com Fixes: ee65728e103b ("docs: rename Documentation/vm to Documentation/mm") Signed-off-by: Vegard Nossum Acked-by: Mike Rapoport (IBM) Reviewed-by: Muchun Song Reviewed-by: Rik van Riel Acked-by: Mike Kravetz Cc: Matthew Wilcox Cc: Ira Weiny Cc: Jonathan Corbet Cc: Wu XiangCheng Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h index 9fd0cb270502..2fcae92d3359 100644 --- a/mm/hugetlb_vmemmap.h +++ b/mm/hugetlb_vmemmap.h @@ -12,7 +12,7 @@ /* * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See - * Documentation/vm/vmemmap_dedup.rst. + * Documentation/mm/vmemmap_dedup.rst. */ #define HUGETLB_VMEMMAP_RESERVE_SIZE PAGE_SIZE #define HUGETLB_VMEMMAP_RESERVE_PAGES (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page)) -- cgit v1.2.3 From eebb3dabbb5cc590afe32880b5d3726d0fbf88db Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 21 Oct 2023 12:33:22 +0800 Subject: mm: migrate: record the mlocked page status to remove unnecessary lru drain When doing compaction, I found the lru_add_drain() is an obvious hotspot when migrating pages. The distribution of this hotspot is as follows: - 18.75% compact_zone - 17.39% migrate_pages - 13.79% migrate_pages_batch - 11.66% migrate_folio_move - 7.02% lru_add_drain + 7.02% lru_add_drain_cpu + 3.00% move_to_new_folio 1.23% rmap_walk + 1.92% migrate_folio_unmap + 3.20% migrate_pages_sync + 0.90% isolate_migratepages The lru_add_drain() was added by commit c3096e6782b7 ("mm/migrate: __unmap_and_move() push good newpage to LRU") to drain the newpage to LRU immediately, to help to build up the correct newpage->mlock_count in remove_migration_ptes() for mlocked pages. However, if there are no mlocked pages are migrating, then we can avoid this lru drain operation, especailly for the heavy concurrent scenarios. So we can record the source pages' mlocked status in migrate_folio_unmap(), and only drain the lru list when the mlocked status is set in migrate_folio_move(). In addition, the page was already isolated from lru when migrating, so checking the mlocked status is stable by folio_test_mlocked() in migrate_folio_unmap(). After this patch, I can see the hotpot of the lru_add_drain() is gone: - 9.41% migrate_pages_batch - 6.15% migrate_folio_move - 3.64% move_to_new_folio + 1.80% migrate_folio_extra + 1.70% buffer_migrate_folio + 1.41% rmap_walk + 0.62% folio_add_lru + 3.07% migrate_folio_unmap Meanwhile, the compaction latency shows some improvements when running thpscale: base patched Amean fault-both-1 1131.22 ( 0.00%) 1112.55 * 1.65%* Amean fault-both-3 2489.75 ( 0.00%) 2324.15 * 6.65%* Amean fault-both-5 3257.37 ( 0.00%) 3183.18 * 2.28%* Amean fault-both-7 4257.99 ( 0.00%) 4079.04 * 4.20%* Amean fault-both-12 6614.02 ( 0.00%) 6075.60 * 8.14%* Amean fault-both-18 10607.78 ( 0.00%) 8978.86 * 15.36%* Amean fault-both-24 14911.65 ( 0.00%) 11619.55 * 22.08%* Amean fault-both-30 14954.67 ( 0.00%) 14925.66 * 0.19%* Amean fault-both-32 16654.87 ( 0.00%) 15580.31 * 6.45%* Link: https://lkml.kernel.org/r/06e9153a7a4850352ec36602df3a3a844de45698.1697859741.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: Hugh Dickins Cc: Mel Gorman Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/migrate.c | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 125194f5af0f..35a88334bb3c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1027,22 +1027,28 @@ union migration_ptr { struct anon_vma *anon_vma; struct address_space *mapping; }; + +enum { + PAGE_WAS_MAPPED = BIT(0), + PAGE_WAS_MLOCKED = BIT(1), +}; + static void __migrate_folio_record(struct folio *dst, - unsigned long page_was_mapped, + unsigned long old_page_state, struct anon_vma *anon_vma) { union migration_ptr ptr = { .anon_vma = anon_vma }; dst->mapping = ptr.mapping; - dst->private = (void *)page_was_mapped; + dst->private = (void *)old_page_state; } static void __migrate_folio_extract(struct folio *dst, - int *page_was_mappedp, + int *old_page_state, struct anon_vma **anon_vmap) { union migration_ptr ptr = { .mapping = dst->mapping }; *anon_vmap = ptr.anon_vma; - *page_was_mappedp = (unsigned long)dst->private; + *old_page_state = (unsigned long)dst->private; dst->mapping = NULL; dst->private = NULL; } @@ -1103,7 +1109,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, { struct folio *dst; int rc = -EAGAIN; - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__folio_test_movable(src); bool locked = false; @@ -1157,6 +1163,8 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, folio_lock(src); } locked = true; + if (folio_test_mlocked(src)) + old_page_state |= PAGE_WAS_MLOCKED; if (folio_test_writeback(src)) { /* @@ -1206,7 +1214,7 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, dst_locked = true; if (unlikely(!is_lru)) { - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1232,11 +1240,11 @@ static int migrate_folio_unmap(new_folio_t get_new_folio, VM_BUG_ON_FOLIO(folio_test_anon(src) && !folio_test_ksm(src) && !anon_vma, src); try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0); - page_was_mapped = 1; + old_page_state |= PAGE_WAS_MAPPED; } if (!folio_mapped(src)) { - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return MIGRATEPAGE_UNMAP; } @@ -1248,7 +1256,8 @@ out: if (rc == -EAGAIN) ret = NULL; - migrate_folio_undo_src(src, page_was_mapped, anon_vma, locked, ret); + migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + anon_vma, locked, ret); migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private); return rc; @@ -1261,12 +1270,12 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, struct list_head *ret) { int rc; - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; bool is_lru = !__folio_test_movable(src); struct list_head *prev; - __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); + __migrate_folio_extract(dst, &old_page_state, &anon_vma); prev = dst->lru.prev; list_del(&dst->lru); @@ -1287,10 +1296,10 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private, * isolated from the unevictable LRU: but this case is the easiest. */ folio_add_lru(dst); - if (page_was_mapped) + if (old_page_state & PAGE_WAS_MLOCKED) lru_add_drain(); - if (page_was_mapped) + if (old_page_state & PAGE_WAS_MAPPED) remove_migration_ptes(src, dst, false); out_unlock_both: @@ -1322,11 +1331,12 @@ out: */ if (rc == -EAGAIN) { list_add(&dst->lru, prev); - __migrate_folio_record(dst, page_was_mapped, anon_vma); + __migrate_folio_record(dst, old_page_state, anon_vma); return rc; } - migrate_folio_undo_src(src, page_was_mapped, anon_vma, true, ret); + migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret); migrate_folio_undo_dst(dst, true, put_new_folio, private); return rc; @@ -1799,12 +1809,12 @@ out: dst = list_first_entry(&dst_folios, struct folio, lru); dst2 = list_next_entry(dst, lru); list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) { - int page_was_mapped = 0; + int old_page_state = 0; struct anon_vma *anon_vma = NULL; - __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); - migrate_folio_undo_src(folio, page_was_mapped, anon_vma, - true, ret_folios); + __migrate_folio_extract(dst, &old_page_state, &anon_vma); + migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED, + anon_vma, true, ret_folios); list_del(&dst->lru); migrate_folio_undo_dst(dst, true, put_new_folio, private); dst = dst2; -- cgit v1.2.3 From 1cbf0a58847b30507611f92ad69964ef37264d14 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Oct 2023 23:26:08 -0700 Subject: ext4: add __GFP_NOWARN to GFP_NOWAIT in readahead Since commit e509ad4d77e6 ("ext4: use bdev_getblk() to avoid memory reclaim in readahead path") rightly replaced GFP_NOFAIL allocations by GFP_NOWAIT allocations, I've occasionally been seeing "page allocation failure: order:0" warnings under load: all with ext4_sb_breadahead_unmovable() in the stack. I don't think those warnings are of any interest: suppress them with __GFP_NOWARN. Link: https://lkml.kernel.org/r/7bc6ad16-9a4d-dd90-202e-47d6cbb5a136@google.com Fixes: e509ad4d77e6 ("ext4: use bdev_getblk() to avoid memory reclaim in readahead path") Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Hui Zhu Cc: Matthew Wilcox (Oracle) Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c00ec159dea5..56a08fc5c5d5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -262,7 +262,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, - sb->s_blocksize, GFP_NOWAIT); + sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) -- cgit v1.2.3 From b1454b463c217e5bc553acc44b2389d9257c9708 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Oct 2023 23:38:41 -0700 Subject: mm: mlock: avoid folio_within_range() on KSM pages Since commit dc68badcede4 ("mm: mlock: update mlock_pte_range to handle large folio") I've just occasionally seen VM_WARN_ON_FOLIO(folio_test_ksm) warnings from folio_within_range(), in a splurge after testing with KSM hyperactive. folio_referenced_one()'s use of folio_within_vma() is safe because it checks folio_test_large() first; but allow_mlock_munlock() needs to do the same to avoid those warnings (or check !folio_test_ksm() itself? Or move either check into folio_within_range()? Hard to tell without more examples of its use). Link: https://lkml.kernel.org/r/23852f6a-5bfa-1ffd-30db-30c5560ad426@google.com Fixes: dc68badcede4 ("mm: mlock: update mlock_pte_range to handle large folio") Signed-off-by: Hugh Dickins Reviewed-by: Yin Fengwei Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Stefan Roesch Signed-off-by: Andrew Morton --- mm/mlock.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/mlock.c b/mm/mlock.c index aa44456200e3..086546ac5766 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -346,6 +346,10 @@ static inline bool allow_mlock_munlock(struct folio *folio, if (!(vma->vm_flags & VM_LOCKED)) return true; + /* folio_within_range() cannot take KSM, but any small folio is OK */ + if (!folio_test_large(folio)) + return true; + /* folio not in range [start, end), skip mlock */ if (!folio_within_range(folio, vma, start, end)) return false; -- cgit v1.2.3 From 35f5d94187a6a3a8df2cba54beccca1c2379edb8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:20 +0000 Subject: mm/damon: implement a function for max nr_accesses safe calculation Patch series "avoid divide-by-zero due to max_nr_accesses overflow". The maximum nr_accesses of given DAMON context can be calculated by dividing the aggregation interval by the sampling interval. Some logics in DAMON uses the maximum nr_accesses as a divisor. Hence, the value shouldn't be zero. Such case is avoided since DAMON avoids setting the agregation interval as samller than the sampling interval. However, since nr_accesses is unsigned int while the intervals are unsigned long, the maximum nr_accesses could be zero while casting. Avoid the divide-by-zero by implementing a function that handles the corner case (first patch), and replaces the vulnerable direct max nr_accesses calculations (remaining patches). Note that the patches for the replacements are divided for broken commits, to make backporting on required tres easier. Especially, the last patch is for a patch that not yet merged into the mainline but in mm tree. This patch (of 4): The maximum nr_accesses of given DAMON context can be calculated by dividing the aggregation interval by the sampling interval. Some logics in DAMON uses the maximum nr_accesses as a divisor. Hence, the value shouldn't be zero. Such case is avoided since DAMON avoids setting the agregation interval as samller than the sampling interval. However, since nr_accesses is unsigned int while the intervals are unsigned long, the maximum nr_accesses could be zero while casting. Implement a function that handles the corner case. Note that this commit is not fixing the real issue since this is only introducing the safe function that will replaces the problematic divisions. The replacements will be made by followup commits, to make backporting on stable series easier. Link: https://lkml.kernel.org/r/20231019194924.100347-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231019194924.100347-2-sj@kernel.org Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [5.16+] Signed-off-by: Andrew Morton --- include/linux/damon.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 27b995c22497..ab2f17d9926b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -681,6 +681,13 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) return ctx->ops.id == DAMON_OPS_VADDR || ctx->ops.id == DAMON_OPS_FVADDR; } +static inline unsigned int damon_max_nr_accesses(const struct damon_attrs *attrs) +{ + /* {aggr,sample}_interval are unsigned long, hence could overflow */ + return min(attrs->aggr_interval / attrs->sample_interval, + (unsigned long)UINT_MAX); +} + int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -- cgit v1.2.3 From d35963bfb05877455228ecec6b194f624489f96a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:21 +0000 Subject: mm/damon/core: avoid divide-by-zero during monitoring results update When monitoring attributes are changed, DAMON updates access rate of the monitoring results accordingly. For that, it divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Link: https://lkml.kernel.org/r/20231019194924.100347-3-sj@kernel.org Fixes: 2f5bef5a590b ("mm/damon/core: update monitoring results for new monitoring attributes") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [6.3+] Signed-off-by: Andrew Morton --- mm/damon/core.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 9f4f7c378cf3..e194c8075235 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -500,20 +500,14 @@ static unsigned int damon_age_for_new_attrs(unsigned int age, static unsigned int damon_accesses_bp_to_nr_accesses( unsigned int accesses_bp, struct damon_attrs *attrs) { - unsigned int max_nr_accesses = - attrs->aggr_interval / attrs->sample_interval; - - return accesses_bp * max_nr_accesses / 10000; + return accesses_bp * damon_max_nr_accesses(attrs) / 10000; } /* convert nr_accesses to access ratio in bp (per 10,000) */ static unsigned int damon_nr_accesses_to_accesses_bp( unsigned int nr_accesses, struct damon_attrs *attrs) { - unsigned int max_nr_accesses = - attrs->aggr_interval / attrs->sample_interval; - - return nr_accesses * 10000 / max_nr_accesses; + return nr_accesses * 10000 / damon_max_nr_accesses(attrs); } static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, -- cgit v1.2.3 From 3bafc47d3c4a2fc4d3b382aeb3c087f8fc84d9fd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:22 +0000 Subject: mm/damon/ops-common: avoid divide-by-zero during region hotness calculation When calculating the hotness of each region for the under-quota regions prioritization, DAMON divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Link: https://lkml.kernel.org/r/20231019194924.100347-4-sj@kernel.org Fixes: 198f0f4c58b9 ("mm/damon/vaddr,paddr: support pageout prioritization") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [5.16+] Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index ac1c3fa80f98..d25d99cb5f2b 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -73,7 +73,6 @@ void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { - unsigned int max_nr_accesses; int freq_subscore; unsigned int age_in_sec; int age_in_log, age_subscore; @@ -81,8 +80,8 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; - freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; + freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / + damon_max_nr_accesses(&c->attrs); age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; -- cgit v1.2.3 From 44063f125af4bb4efd1d500d8091fa33a98af325 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:23 +0000 Subject: mm/damon/lru_sort: avoid divide-by-zero in hot threshold calculation When calculating the hotness threshold for lru_prio scheme of DAMON_LRU_SORT, the module divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Link: https://lkml.kernel.org/r/20231019194924.100347-5-sj@kernel.org Fixes: 40e983cca927 ("mm/damon: introduce DAMON-based LRU-lists Sorting") Signed-off-by: SeongJae Park Reported-by: Jakub Acs Cc: [6.0+] Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 3ecdcc029443..f2e5f9431892 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -195,9 +195,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = damon_lru_sort_mon_attrs.aggr_interval / - damon_lru_sort_mon_attrs.sample_interval * + hot_thres = damon_max_nr_accesses(&damon_lru_sort_mon_attrs) * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) -- cgit v1.2.3 From 62f76a7b53bfa2ecfe1570a5b1d0d574c576a56d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 19 Oct 2023 19:49:24 +0000 Subject: mm/damon/core: avoid divide-by-zero from pseudo-moving window length calculation When calculating the pseudo-moving access rate, DAMON divides some values by the maximum nr_accesses. However, due to the type of the related variables, simple division-based calculation of the divisor can return zero. As a result, divide-by-zero is possible. Fix it by using damon_max_nr_accesses(), which handles the case. Note that this is a fix for a commit that not in the mainline but mm tree. Link: https://lkml.kernel.org/r/20231019194924.100347-6-sj@kernel.org Fixes: ace30fb21af5 ("mm/damon/core: use pseudo-moving sum for nr_accesses_bp") Reported-by: Jakub Acs Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index e194c8075235..aa2dc7087cd9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1665,7 +1665,7 @@ void damon_update_region_access_rate(struct damon_region *r, bool accessed, * aggr_interval, owing to validation of damon_set_attrs(). */ if (attrs->sample_interval) - len_window = attrs->aggr_interval / attrs->sample_interval; + len_window = damon_max_nr_accesses(attrs); r->nr_accesses_bp = damon_moving_sum(r->nr_accesses_bp, r->last_nr_accesses * 10000, len_window, accessed ? 10000 : 0); -- cgit v1.2.3 From b8ee5575f763c239902f8523d82103a45c153b29 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:34 +0000 Subject: mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets() damon_sysfs_set_targets() had a bug that can result in unexpected memory usage and monitoring overhead increase. The bug has fixed by a previous commit. Add a unit test for avoiding a similar bug of future. Link: https://lkml.kernel.org/r/20231022210735.46409-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 12 +++++++ mm/damon/sysfs-test.h | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/damon/sysfs.c | 2 ++ 3 files changed, 100 insertions(+) create mode 100644 mm/damon/sysfs-test.h diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 436c6b4cb5ec..29f43fbc2eff 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -59,6 +59,18 @@ config DAMON_SYSFS This builds the sysfs interface for DAMON. The user space can use the interface for arbitrary data access monitoring. +config DAMON_SYSFS_KUNIT_TEST + bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + depends on DAMON_SYSFS && KUNIT=y + default KUNIT_ALL_TESTS + help + This builds the DAMON sysfs interface Kunit test suite. + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation. + + If unsure, say N. + config DAMON_DBGFS bool "DAMON debugfs interface (DEPRECATED!)" depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS diff --git a/mm/damon/sysfs-test.h b/mm/damon/sysfs-test.h new file mode 100644 index 000000000000..73bdce2452c1 --- /dev/null +++ b/mm/damon/sysfs-test.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Data Access Monitor Unit Tests + * + * Author: SeongJae Park + */ + +#ifdef CONFIG_DAMON_SYSFS_KUNIT_TEST + +#ifndef _DAMON_SYSFS_TEST_H +#define _DAMON_SYSFS_TEST_H + +#include + +static unsigned int nr_damon_targets(struct damon_ctx *ctx) +{ + struct damon_target *t; + unsigned int nr_targets = 0; + + damon_for_each_target(t, ctx) + nr_targets++; + + return nr_targets; +} + +static int __damon_sysfs_test_get_any_pid(int min, int max) +{ + struct pid *pid; + int i; + + for (i = min; i <= max; i++) { + pid = find_get_pid(i); + if (pid) { + put_pid(pid); + return i; + } + } + return -1; +} + +static void damon_sysfs_test_set_targets(struct kunit *test) +{ + struct damon_sysfs_targets *sysfs_targets; + struct damon_sysfs_target *sysfs_target; + struct damon_ctx *ctx; + + sysfs_targets = damon_sysfs_targets_alloc(); + sysfs_targets->nr = 1; + sysfs_targets->targets_arr = kmalloc_array(1, + sizeof(*sysfs_targets->targets_arr), GFP_KERNEL); + + sysfs_target = damon_sysfs_target_alloc(); + sysfs_target->pid = __damon_sysfs_test_get_any_pid(12, 100); + sysfs_target->regions = damon_sysfs_regions_alloc(); + sysfs_targets->targets_arr[0] = sysfs_target; + + ctx = damon_new_ctx(); + + damon_sysfs_set_targets(ctx, sysfs_targets); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); + + sysfs_target->pid = __damon_sysfs_test_get_any_pid( + sysfs_target->pid + 1, 200); + damon_sysfs_set_targets(ctx, sysfs_targets); + KUNIT_EXPECT_EQ(test, 1u, nr_damon_targets(ctx)); + + damon_destroy_ctx(ctx); + kfree(sysfs_targets->targets_arr); + kfree(sysfs_targets); + kfree(sysfs_target); +} + +static struct kunit_case damon_sysfs_test_cases[] = { + KUNIT_CASE(damon_sysfs_test_set_targets), + {}, +}; + +static struct kunit_suite damon_sysfs_test_suite = { + .name = "damon-sysfs", + .test_cases = damon_sysfs_test_cases, +}; +kunit_test_suite(damon_sysfs_test_suite); + +#endif /* _DAMON_SYSFS_TEST_H */ + +#endif /* CONFIG_DAMON_SYSFS_KUNIT_TEST */ diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index f73dc88d2d19..5846970cedce 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1836,3 +1836,5 @@ out: return err; } subsys_initcall(damon_sysfs_init); + +#include "sysfs-test.h" -- cgit v1.2.3 From 10969b5571387047cd461a9c701b8b9cd007f6c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:15:09 -0700 Subject: hugetlbfs: drop shared NUMA mempolicy pretence Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2. Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma from shmem folio allocation, and removing the mmap_lock around folio migration for mbind and migrate_pages syscalls. This patch (of 12): hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA mempolicy for this file? hugetlb_vm_ops has never offered a set_policy method, and hugetlbfs_parse_param() has never supported any mpol options for a mount-wide default policy. It's just an illusion: clean it away so as not to confuse others, giving us more freedom to adjust shmem's set_policy/get_policy implementation. But hugetlbfs_inode_info is still required, just to accommodate seals. Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a set_policy method and/or mpol mount option (Andi's first posting did include an admitted-unsatisfactory hugetlb_set_policy()); but it seems that nobody has bothered to add that in the nineteen years since v2.6.7 made it possible, and there is at least one company that has invested enough into hugetlbfs, that I guess they have learnt well enough how to manage its NUMA, without needing shared mempolicy. Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in its place, because hugetlb.h's recently added use of filemap_lock_folio() requires that (although most .configs and .c's get it in some other way). Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 41 +---------------------------------------- include/linux/hugetlb.h | 3 +-- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 85cd418d2340..3ad5fc3cb8db 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { {} }; -#ifdef CONFIG_NUMA -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ - vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, - index); -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ - mpol_cond_put(vma->vm_policy); -} -#else -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ -} -#endif - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -853,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* * Initialize a pseudo vma as this is required by the huge page - * allocation routines. If NUMA is configured, use page index - * as input to create an allocation policy. + * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); @@ -902,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); - hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); @@ -1283,18 +1257,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } - - /* - * Any time after allocation, hugetlbfs_destroy_inode can be called - * for the inode. mpol_free_shared_policy is unconditionally called - * as part of hugetlbfs_destroy_inode. So, initialize policy here - * in case of a quick call to destroy. - * - * Note that the policy is initialized even if we are creating a - * private inode. This simplifies hugetlbfs_destroy_inode. - */ - mpol_shared_policy_init(&p->policy, NULL); - return &p->vfs_inode; } @@ -1306,7 +1268,6 @@ static void hugetlbfs_free_inode(struct inode *inode) static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 205469aa0613..158ff156710b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -#include +#include #include #include @@ -545,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) } struct hugetlbfs_inode_info { - struct shared_policy policy; struct inode vfs_inode; unsigned int seals; }; -- cgit v1.2.3 From 4b981bc1aa73c204c2aa7f99b5f4f74d03b0e381 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:16:29 -0700 Subject: kernfs: drop shared NUMA mempolicy hooks It seems strange that kernfs should be an outlier with a set_policy and get_policy in its kernfs_vm_ops. Ah, it dates back to v2.6.30's commit 095160aee954 ("sysfs: fix some bin_vm_ops errors"), when I had crashed on powerpc's pci_mmap_legacy_page_range() fallback to shmem_zero_setup(). Well, that was commendably thorough, to give sysfs-bin a set_policy and get_policy, just to avoid the way it was coded resulting in EINVAL from mmap when CONFIG_NUMA; but somehow feels a bit over-the-top to me now. It's easier to say that nobody should expect to manage a shmem object's shared NUMA mempolicy via some kernfs backdoor to that object: delete that code (and there's no longer an EINVAL from mmap in the NUMA case). This then leaves set_policy/get_policy as implemented only by shmem - though importantly also by SysV SHM, which has to interface with shmem which implements them, and with SHM_HUGETLB which does not. Link: https://lkml.kernel.org/r/302164-a760-4a9e-879b-6870c9b4013@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/kernfs/file.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 180906c36f51..aaa76410e550 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, return ret; } -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - kernfs_put_active(of->kn); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!kernfs_get_active(of->kn)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - kernfs_put_active(of->kn); - return pol; -} - -#endif - static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, -#endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From 1cb5d11a370f661c5d0d888bb0cfc2cdc5791382 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:17:43 -0700 Subject: mempolicy: fix migrate_pages(2) syscall return nr_failed "man 2 migrate_pages" says "On success migrate_pages() returns the number of pages that could not be moved". Although 5.3 and 5.4 commits fixed mbind(MPOL_MF_STRICT|MPOL_MF_MOVE*) to fail with EIO when not all pages could be moved (because some could not be isolated for migration), migrate_pages(2) was left still reporting only those pages failing at the migration stage, forgetting those failing at the earlier isolation stage. Fix that by accumulating a long nr_failed count in struct queue_pages, returned by queue_pages_range() when it's not returning an error, for adding on to the nr_failed count from migrate_pages() in mm/migrate.c. A count of pages? It's more a count of folios, but changing it to pages would entail more work (also in mm/migrate.c): does not seem justified. queue_pages_range() itself should only return -EIO in the "strictly unmovable" case (STRICT without any MOVEs): in that case it's best to break out as soon as nr_failed gets set; but otherwise it should continue to isolate pages for MOVing even when nr_failed - as the mbind(2) manpage promises. There's a case when nr_failed should be incremented when it was missed: queue_folios_pte_range() and queue_folios_hugetlb() count the transient migration entries, like queue_folios_pmd() already did. And there's a case when nr_failed should not be incremented when it would have been: in meeting later PTEs of the same large folio, which can only be isolated once: fixed by recording the current large folio in struct queue_pages. Clean up the affected functions, fixing or updating many comments. Bool migrate_folio_add(), without -EIO: true if adding, or if skipping shared (but its arguable folio_estimated_sharers() heuristic left unchanged). Use MPOL_MF_WRLOCK flag to queue_pages_range(), instead of bool lock_vma. Use explicit STRICT|MOVE* flags where queue_pages_test_walk() checks for skipping, instead of hiding them behind MPOL_MF_VALID. Link: https://lkml.kernel.org/r/9a6b0b9-3bb-dbef-8adf-efab4397b8d@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: "Huang, Ying" Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 338 +++++++++++++++++++++++++++------------------------------ 1 file changed, 159 insertions(+), 179 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8bb5b4b4b395..b6d82c8bc479 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -111,7 +111,8 @@ /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ -#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_WRLOCK (MPOL_MF_INTERNAL << 2) /* Write-lock walked vmas */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; @@ -416,9 +417,19 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { }, }; -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static bool strictly_unmovable(unsigned long flags) +{ + /* + * STRICT without MOVE flags lets do_mbind() fail immediately with -EIO + * if any misplaced page is found. + */ + return (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) == + MPOL_MF_STRICT; +} + struct queue_pages { struct list_head *pagelist; unsigned long flags; @@ -426,7 +437,8 @@ struct queue_pages { unsigned long start; unsigned long end; struct vm_area_struct *first; - bool has_unmovable; + struct folio *large; /* note last large folio encountered */ + long nr_failed; /* could not be isolated at this time */ }; /* @@ -444,61 +456,37 @@ static inline bool queue_folio_required(struct folio *folio, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } -/* - * queue_folios_pmd() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). - * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an - * existing folio was already on a node that does not follow the - * policy. - */ -static int queue_folios_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, - unsigned long end, struct mm_walk *walk) - __releases(ptl) +static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk) { - int ret = 0; struct folio *folio; struct queue_pages *qp = walk->private; - unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = -EIO; - goto unlock; + qp->nr_failed++; + return; } folio = pfn_folio(pmd_pfn(*pmd)); if (is_huge_zero_page(&folio->page)) { walk->action = ACTION_CONTINUE; - goto unlock; + return; } if (!queue_folio_required(folio, qp)) - goto unlock; - - flags = qp->flags; - /* go to folio migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - if (!vma_migratable(walk->vma) || - migrate_folio_add(folio, qp->pagelist, flags)) { - qp->has_unmovable = true; - goto unlock; - } - } else - ret = -EIO; -unlock: - spin_unlock(ptl); - return ret; + return; + if (!(qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma) || + !migrate_folio_add(folio, qp->pagelist, qp->flags)) + qp->nr_failed++; } /* - * Scan through pages checking if pages follow certain conditions, - * and move them to the pagelist if they do. + * Scan through folios, checking if they satisfy the required conditions, + * moving them from LRU to local pagelist for migration if they do (or not). * - * queue_folios_pte_range() has three possible return values: - * 0 - folios are placed on the right node or queued successfully, or - * special page is met, i.e. zero page, or unmovable page is found - * but continue walking (indicated by queue_pages.has_unmovable). - * -EIO - only MPOL_MF_STRICT was specified and an existing folio was already - * on a node that does not follow the policy. + * queue_folios_pte_range() has two possible return values: + * 0 - continue walking to scan for more, even if an existing folio on the + * wrong node could not be isolated and queued for migration. + * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL, + * and an existing folio was on a node that does not follow the policy. */ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -512,8 +500,11 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) - return queue_folios_pmd(pmd, ptl, addr, end, walk); + if (ptl) { + queue_folios_pmd(pmd, walk); + spin_unlock(ptl); + goto out; + } mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); if (!pte) { @@ -522,8 +513,13 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, } for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = ptep_get(pte); - if (!pte_present(ptent)) + if (pte_none(ptent)) continue; + if (!pte_present(ptent)) { + if (is_migration_entry(pte_to_swp_entry(ptent))) + qp->nr_failed++; + continue; + } folio = vm_normal_folio(vma, addr, ptent); if (!folio || folio_is_zone_device(folio)) continue; @@ -535,95 +531,87 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_folio_required(folio, qp)) continue; - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - /* - * MPOL_MF_STRICT must be specified if we get here. - * Continue walking vmas due to MPOL_MF_MOVE* flags. - */ - if (!vma_migratable(vma)) - qp->has_unmovable = true; - + if (folio_test_large(folio)) { /* - * Do not abort immediately since there may be - * temporary off LRU pages in the range. Still - * need migrate other LRU pages. + * A large folio can only be isolated from LRU once, + * but may be mapped by many PTEs (and Copy-On-Write may + * intersperse PTEs of other, order 0, folios). This is + * a common case, so don't mistake it for failure (but + * there can be other cases of multi-mapped pages which + * this quick check does not help to filter out - and a + * search of the pagelist might grow to be prohibitive). + * + * migrate_pages(&pagelist) returns nr_failed folios, so + * check "large" now so that queue_pages_range() returns + * a comparable nr_failed folios. This does imply that + * if folio could not be isolated for some racy reason + * at its first PTE, later PTEs will not give it another + * chance of isolation; but keeps the accounting simple. */ - if (migrate_folio_add(folio, qp->pagelist, flags)) - qp->has_unmovable = true; - } else - break; + if (folio == qp->large) + continue; + qp->large = folio; + } + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(vma) || + !migrate_folio_add(folio, qp->pagelist, flags)) { + qp->nr_failed++; + if (strictly_unmovable(flags)) + break; + } } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); - - return addr != end ? -EIO : 0; +out: + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; + return 0; } static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; - unsigned long flags = (qp->flags & MPOL_MF_VALID); + unsigned long flags = qp->flags; struct folio *folio; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); - if (!pte_present(entry)) + if (!pte_present(entry)) { + if (unlikely(is_hugetlb_entry_migration(entry))) + qp->nr_failed++; goto unlock; + } folio = pfn_folio(pte_pfn(entry)); if (!queue_folio_required(folio, qp)) goto unlock; - - if (flags == MPOL_MF_STRICT) { - /* - * STRICT alone means only detecting misplaced folio and no - * need to further check other vma. - */ - ret = -EIO; + if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || + !vma_migratable(walk->vma)) { + qp->nr_failed++; goto unlock; } - - if (!vma_migratable(walk->vma)) { - /* - * Must be STRICT with MOVE*, otherwise .test_walk() have - * stopped walking current vma. - * Detecting misplaced folio but allow migrating folios which - * have been queued. - */ - qp->has_unmovable = true; - goto unlock; - } - /* - * With MPOL_MF_MOVE, we try to migrate only unshared folios. If it - * is shared it is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * expensive, so check the estimated sharers of the folio instead. */ - if (flags & (MPOL_MF_MOVE_ALL) || - (flags & MPOL_MF_MOVE && folio_estimated_sharers(folio) == 1 && - !hugetlb_pmd_shared(pte))) { - if (!isolate_hugetlb(folio, qp->pagelist) && - (flags & MPOL_MF_STRICT)) - /* - * Failed to isolate folio but allow migrating pages - * which have been queued. - */ - qp->has_unmovable = true; - } + if ((flags & MPOL_MF_MOVE_ALL) || + (folio_estimated_sharers(folio) == 1 && !hugetlb_pmd_shared(pte))) + if (!isolate_hugetlb(folio, qp->pagelist)) + qp->nr_failed++; unlock: spin_unlock(ptl); -#else - BUG(); + if (qp->nr_failed && strictly_unmovable(flags)) + return -EIO; #endif - return ret; + return 0; } #ifdef CONFIG_NUMA_BALANCING @@ -704,8 +692,11 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, return 1; } - /* queue pages from current vma */ - if (flags & MPOL_MF_VALID) + /* + * Check page nodes, and queue pages to move, in the current vma. + * But if no moving, and no strict checking, the scan can be skipped. + */ + if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) return 0; return 1; } @@ -727,22 +718,21 @@ static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = { /* * Walk through page tables and collect pages to be migrated. * - * If pages found in a given range are on a set of nodes (determined by - * @nodes and @flags,) it's isolated and queued to the pagelist which is - * passed via @private. + * If pages found in a given range are not on the required set of @nodes, + * and migration is allowed, they are isolated and queued to @pagelist. * - * queue_pages_range() has three possible return values: - * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were - * specified. - * 0 - queue pages successfully or no misplaced page. - * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or - * memory range specified by nodemask and maxnode points outside - * your accessible address space (-EFAULT) + * queue_pages_range() may return: + * 0 - all pages already on the right node, or successfully queued for moving + * (or neither strict checking nor moving requested: only range checking). + * >0 - this number of misplaced folios could not be queued for moving + * (a hugetlbfs page or a transparent huge page being counted as 1). + * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs. + * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified. */ -static int +static long queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist, bool lock_vma) + struct list_head *pagelist) { int err; struct queue_pages qp = { @@ -752,20 +742,17 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, .start = start, .end = end, .first = NULL, - .has_unmovable = false, }; - const struct mm_walk_ops *ops = lock_vma ? + const struct mm_walk_ops *ops = (flags & MPOL_MF_WRLOCK) ? &queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops; err = walk_page_range(mm, start, end, ops, &qp); - if (qp.has_unmovable) - err = 1; if (!qp.first) /* whole range in hole */ err = -EFAULT; - return err; + return err ? : qp.nr_failed; } /* @@ -1008,16 +995,16 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } #ifdef CONFIG_MIGRATION -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { /* - * We try to migrate only unshared folios. If it is shared it - * is likely not worth migrating. + * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio. + * Choosing not to migrate a shared folio is not counted as a failure. * * To check if the folio is shared, ideally we want to make sure * every page is mapped to the same process. Doing that is very - * expensive, so check the estimated mapcount of the folio instead. + * expensive, so check the estimated sharers of the folio instead. */ if ((flags & MPOL_MF_MOVE_ALL) || folio_estimated_sharers(folio) == 1) { if (folio_isolate_lru(folio)) { @@ -1025,32 +1012,31 @@ static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); - } else if (flags & MPOL_MF_STRICT) { + } else { /* * Non-movable folio may reach here. And, there may be * temporary off LRU folios or non-LRU movable folios. * Treat them as unmovable folios since they can't be - * isolated, so they can't be moved at the moment. It - * should return -EIO for this case too. + * isolated, so they can't be moved at the moment. */ - return -EIO; + return false; } } - - return 0; + return true; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -static int migrate_to_node(struct mm_struct *mm, int source, int dest, - int flags) +static long migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; struct vm_area_struct *vma; LIST_HEAD(pagelist); - int err = 0; + long nr_failed; + long err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, @@ -1059,23 +1045,27 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodes_clear(nmask); node_set(source, nmask); + VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + vma = find_vma(mm, 0); + /* - * This does not "check" the range but isolates all pages that + * This does not migrate the range, but isolates all pages that * need migration. Between passing in the full user address - * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. + * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail, + * but passes back the count of pages which could not be isolated. */ - vma = find_vma(mm, 0); - VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, - flags | MPOL_MF_DISCONTIG_OK, &pagelist, false); + nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, - (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); + (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL); if (err) putback_movable_pages(&pagelist); } + if (err >= 0) + err += nr_failed; return err; } @@ -1088,8 +1078,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { - int busy = 0; - int err = 0; + long nr_failed = 0; + long err = 0; nodemask_t tmp; lru_cache_disable(); @@ -1171,7 +1161,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) - busy += err; + nr_failed += err; if (err < 0) break; } @@ -1180,8 +1170,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_enable(); if (err < 0) return err; - return busy; - + return (nr_failed < INT_MAX) ? nr_failed : INT_MAX; } /* @@ -1220,10 +1209,10 @@ static struct folio *new_folio(struct folio *src, unsigned long start) } #else -static int migrate_folio_add(struct folio *folio, struct list_head *foliolist, +static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags) { - return -EIO; + return false; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, @@ -1247,8 +1236,8 @@ static long do_mbind(unsigned long start, unsigned long len, struct vma_iterator vmi; struct mempolicy *new; unsigned long end; - int err; - int ret; + long err; + long nr_failed; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) @@ -1288,10 +1277,8 @@ static long do_mbind(unsigned long start, unsigned long len, start, start + len, mode, mode_flags, nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); - } { NODEMASK_SCRATCH(scratch); if (scratch) { @@ -1307,44 +1294,37 @@ static long do_mbind(unsigned long start, unsigned long len, goto mpol_out; /* - * Lock the VMAs before scanning for pages to migrate, to ensure we don't - * miss a concurrently inserted page. + * Lock the VMAs before scanning for pages to migrate, + * to ensure we don't miss a concurrently inserted page. */ - ret = queue_pages_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist, true); + nr_failed = queue_pages_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist); - if (ret < 0) { - err = ret; - goto up_out; - } - - vma_iter_init(&vmi, mm, start); - prev = vma_prev(&vmi); - for_each_vma_range(vmi, vma, end) { - err = mbind_range(&vmi, vma, &prev, start, end, new); - if (err) - break; + if (nr_failed < 0) { + err = nr_failed; + } else { + vma_iter_init(&vmi, mm, start); + prev = vma_prev(&vmi); + for_each_vma_range(vmi, vma, end) { + err = mbind_range(&vmi, vma, &prev, start, end, new); + if (err) + break; + } } if (!err) { - int nr_failed = 0; - if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); - nr_failed = migrate_pages(&pagelist, new_folio, NULL, + nr_failed |= migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); - if (nr_failed) - putback_movable_pages(&pagelist); } - - if (((ret > 0) || nr_failed) && (flags & MPOL_MF_STRICT)) + if (nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; - } else { -up_out: - if (!list_empty(&pagelist)) - putback_movable_pages(&pagelist); } + if (!list_empty(&pagelist)) + putback_movable_pages(&pagelist); + mmap_write_unlock(mm); mpol_out: mpol_put(new); -- cgit v1.2.3 From 7f1ee4e2070883d18a431c761db8bb30a958b654 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:19:00 -0700 Subject: mempolicy trivia: delete those ancient pr_debug()s Delete those ancient pr_debug()s - PDprintk()s in Andi Kleen's original submission of core NUMA API, and useful when debugging shared mempolicy lifetime back then, but not used recently. Link: https://lkml.kernel.org/r/f25135-ffb2-40d8-9577-720772b333@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b6d82c8bc479..02a9b89c10aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -264,9 +264,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, { struct mempolicy *policy; - pr_debug("setting mode %d flags %d nodes[0] %lx\n", - mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); - if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); @@ -768,11 +765,6 @@ static int vma_replace_policy(struct vm_area_struct *vma, vma_assert_write_locked(vma); - pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", - vma->vm_start, vma->vm_end, vma->vm_pgoff, - vma->vm_ops, vma->vm_file, - vma->vm_ops ? vma->vm_ops->set_policy : NULL); - new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); @@ -1273,10 +1265,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", - start, start + len, mode, mode_flags, - nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) lru_cache_disable(); { @@ -2496,8 +2484,6 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, - new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ @@ -2636,7 +2622,6 @@ void mpol_put_task_policy(struct task_struct *task) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } @@ -2793,12 +2778,6 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", - vma->vm_pgoff, - sz, npol ? npol->mode : -1, - npol ? npol->flags : -1, - npol ? nodes_addr(npol->nodes)[0] : NUMA_NO_NODE); - if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); if (!new) -- cgit v1.2.3 From c36f6e6dff4d32ec8b6da8f553933727a57a7a4a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:20:14 -0700 Subject: mempolicy trivia: slightly more consistent naming Before getting down to work, do a little cleanup, mainly of inconsistent variable naming. I gave up trying to rationalize mpol versus pol versus policy, and node versus nid, but let's avoid p and nd. Remove a few superfluous blank lines, but add one; and here prefer vma->vm_policy to vma_policy(vma) - the latter being appropriate in other sources, which have to allow for !CONFIG_NUMA. That intriguing line about KERNEL_DS? should have gone in v2.6.15, when numa_policy_init() stopped using set_mempolicy(2)'s system call handler. Link: https://lkml.kernel.org/r/68287974-b6ae-7df-4ba-d19ddd69cbf@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 11 ++++--- mm/mempolicy.c | 73 +++++++++++++++++++++-------------------------- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1d7f4ec2614c..a807976fe95d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -124,10 +124,9 @@ struct shared_policy { int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, - struct mempolicy *new); -void mpol_free_shared_policy(struct shared_policy *p); +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *mpol); +void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); @@ -191,7 +190,7 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) return true; } -static inline void mpol_put(struct mempolicy *p) +static inline void mpol_put(struct mempolicy *pol) { } @@ -210,7 +209,7 @@ static inline void mpol_shared_policy_init(struct shared_policy *sp, { } -static inline void mpol_free_shared_policy(struct shared_policy *p) +static inline void mpol_free_shared_policy(struct shared_policy *sp) { } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 02a9b89c10aa..50a57f4c8013 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -25,7 +25,7 @@ * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * - * preferred Try a specific node first before normal fallback. + * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default @@ -52,7 +52,7 @@ * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * - * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between + * For shmem/tmpfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ @@ -291,6 +291,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); + policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); @@ -303,11 +304,11 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, } /* Slow path of a mpol destructor. */ -void __mpol_put(struct mempolicy *p) +void __mpol_put(struct mempolicy *pol) { - if (!atomic_dec_and_test(&p->refcnt)) + if (!atomic_dec_and_test(&pol->refcnt)) return; - kmem_cache_free(policy_cache, p); + kmem_cache_free(policy_cache, pol); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) @@ -364,7 +365,6 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * * Called with task's alloc_lock held. */ - void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); @@ -375,7 +375,6 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ - void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; @@ -757,7 +756,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, - struct mempolicy *pol) + struct mempolicy *pol) { int err; struct mempolicy *old; @@ -800,7 +799,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, vmstart = vma->vm_start; } - if (mpol_equal(vma_policy(vma), new_pol)) { + if (mpol_equal(vma->vm_policy, new_pol)) { *prev = vma; return 0; } @@ -855,18 +854,18 @@ out: * * Called with task's alloc_lock held */ -static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) +static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes) { nodes_clear(*nodes); - if (p == &default_policy) + if (pol == &default_policy) return; - switch (p->mode) { + switch (pol->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: case MPOL_PREFERRED: case MPOL_PREFERRED_MANY: - *nodes = p->nodes; + *nodes = pol->nodes; break; case MPOL_LOCAL: /* return empty node mask for local allocation */ @@ -1634,7 +1633,6 @@ out: out_put: put_task_struct(task); goto out; - } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, @@ -1644,7 +1642,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } - /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -1827,10 +1824,10 @@ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) * policy_node() is always coupled with policy_nodemask(), which * secures the nodemask limit for 'bind' and 'prefer-many' policy. */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) +static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) { if (policy->mode == MPOL_PREFERRED) { - nd = first_node(policy->nodes); + nid = first_node(policy->nodes); } else { /* * __GFP_THISNODE shouldn't even be used with the bind policy @@ -1845,19 +1842,18 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) policy->home_node != NUMA_NO_NODE) return policy->home_node; - return nd; + return nid; } /* Do dynamic interleaving for a process */ -static unsigned interleave_nodes(struct mempolicy *policy) +static unsigned int interleave_nodes(struct mempolicy *policy) { - unsigned next; - struct task_struct *me = current; + unsigned int nid; - next = next_node_in(me->il_prev, policy->nodes); - if (next < MAX_NUMNODES) - me->il_prev = next; - return next; + nid = next_node_in(current->il_prev, policy->nodes); + if (nid < MAX_NUMNODES) + current->il_prev = nid; + return nid; } /* @@ -2347,7 +2343,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { - struct mempolicy *pol = mpol_dup(vma_policy(src)); + struct mempolicy *pol = mpol_dup(src->vm_policy); if (IS_ERR(pol)) return PTR_ERR(pol); @@ -2771,40 +2767,40 @@ put_mpol: } } -int mpol_set_shared_policy(struct shared_policy *info, - struct vm_area_struct *vma, struct mempolicy *npol) +int mpol_set_shared_policy(struct shared_policy *sp, + struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - if (npol) { - new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); + if (pol) { + new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol); if (!new) return -ENOMEM; } - err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); + err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ -void mpol_free_shared_policy(struct shared_policy *p) +void mpol_free_shared_policy(struct shared_policy *sp) { struct sp_node *n; struct rb_node *next; - if (!p->root.rb_node) + if (!sp->root.rb_node) return; - write_lock(&p->lock); - next = rb_first(&p->root); + write_lock(&sp->lock); + next = rb_first(&sp->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); - sp_delete(p, n); + sp_delete(sp, n); } - write_unlock(&p->lock); + write_unlock(&sp->lock); } #ifdef CONFIG_NUMA_BALANCING @@ -2854,7 +2850,6 @@ static inline void __init check_numabalancing_enable(void) } #endif /* CONFIG_NUMA_BALANCING */ -/* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; @@ -2917,7 +2912,6 @@ void numa_default_policy(void) /* * Parse and format mempolicy from/to strings */ - static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", @@ -2928,7 +2922,6 @@ static const char * const policy_modes[] = [MPOL_PREFERRED_MANY] = "prefer (many)", }; - #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. -- cgit v1.2.3 From 93397c3b7684555b7cec726cd13eef6742d191fe Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:21:34 -0700 Subject: mempolicy trivia: use pgoff_t in shared mempolicy tree Prefer the more explicit "pgoff_t" to "unsigned long" when dealing with a shared mempolicy tree. Delete confusing comment about pseudo mm vmas. Link: https://lkml.kernel.org/r/5451157-3818-4af5-fd2c-5d26a5d1dc53@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 20 +++++++------------- mm/mempolicy.c | 12 ++++++------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index a807976fe95d..acdb12fcb6cd 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -105,22 +105,16 @@ static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) /* * Tree of shared policies for a shared memory region. - * Maintain the policies in a pseudo mm that contains vmas. The vmas - * carry the policy. As a special twist the pseudo mm is indexed in pages, not - * bytes, so that we can work with shared memory segments bigger than - * unsigned long. */ - -struct sp_node { - struct rb_node nd; - unsigned long start, end; - struct mempolicy *policy; -}; - struct shared_policy { struct rb_root root; rwlock_t lock; }; +struct sp_node { + struct rb_node nd; + pgoff_t start, end; + struct mempolicy *policy; +}; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); @@ -128,7 +122,7 @@ int mpol_set_shared_policy(struct shared_policy *sp, struct vm_area_struct *vma, struct mempolicy *mpol); void mpol_free_shared_policy(struct shared_policy *sp); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, - unsigned long idx); + pgoff_t idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, @@ -214,7 +208,7 @@ static inline void mpol_free_shared_policy(struct shared_policy *sp) } static inline struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) { return NULL; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 50a57f4c8013..c18f0a1b97e3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2428,8 +2428,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ -static struct sp_node * -sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) +static struct sp_node *sp_lookup(struct shared_policy *sp, + pgoff_t start, pgoff_t end) { struct rb_node *n = sp->root.rb_node; @@ -2483,8 +2483,8 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } /* Find shared policy intersecting idx */ -struct mempolicy * -mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, + pgoff_t idx) { struct mempolicy *pol = NULL; struct sp_node *sn; @@ -2652,8 +2652,8 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end, } /* Replace a policy range. */ -static int shared_policy_replace(struct shared_policy *sp, unsigned long start, - unsigned long end, struct sp_node *new) +static int shared_policy_replace(struct shared_policy *sp, pgoff_t start, + pgoff_t end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; -- cgit v1.2.3 From 35ec8fa0207b3c7f7c3c22337c9a507d7b291626 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:22:59 -0700 Subject: mempolicy: mpol_shared_policy_init() without pseudo-vma mpol_shared_policy_init() does not need to use a pseudo-vma: it can use sp_alloc() and sp_insert() directly, since the object's shared policy tree is empty and inaccessible (needing no lock) at get_inode() time. Link: https://lkml.kernel.org/r/3bef62d8-ae78-4c2-533-56a44ae425c@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c18f0a1b97e3..b6461d7d200e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2736,30 +2736,30 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) rwlock_init(&sp->lock); if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; + struct sp_node *sn; + struct mempolicy *npol; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ - new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) + + /* contextualize the tmpfs mount point mempolicy to this file */ + npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); + if (IS_ERR(npol)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) - goto put_new; - - /* Create pseudo-vma that contains just the policy */ - vma_init(&pvma, NULL); - pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: - mpol_put(new); /* drop initial ref */ + goto put_npol; + + /* alloc node covering entire file; adds ref to file's npol */ + sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol); + if (sn) + sp_insert(sp, sn); +put_npol: + mpol_put(npol); /* drop initial ref on file's npol */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: -- cgit v1.2.3 From 2cafb582173f3870240af90de3f31d18b0728882 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:24:18 -0700 Subject: mempolicy: remove confusing MPOL_MF_LAZY dead code v3.8 commit b24f53a0bea3 ("mm: mempolicy: Add MPOL_MF_LAZY") introduced MPOL_MF_LAZY, and included it in the MPOL_MF_VALID flags; but a720094ded8 ("mm: mempolicy: Hide MPOL_NOOP and MPOL_MF_LAZY from userspace for now") immediately removed it from MPOL_MF_VALID flags, pending further review. "This will need to be revisited", but it has not been reinstated. The present state is confusing: there is dead code in mm/mempolicy.c to handle MPOL_MF_LAZY cases which can never occur. Remove that: it can be resurrected later if necessary. But keep the definition of MPOL_MF_LAZY, which must remain in the UAPI, even though it always fails with EINVAL. https://lore.kernel.org/linux-mm/1553041659-46787-1-git-send-email-yang.shi@linux.alibaba.com/ links to a previous request to remove MPOL_MF_LAZY. Link: https://lkml.kernel.org/r/80c9665c-1c3f-17ba-21a3-f6115cebf7d@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Yang Shi Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/uapi/linux/mempolicy.h | 2 +- mm/mempolicy.c | 18 ------------------ 2 files changed, 1 insertion(+), 19 deletions(-) diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h index 046d0ccba4cd..a8963f7ef4c2 100644 --- a/include/uapi/linux/mempolicy.h +++ b/include/uapi/linux/mempolicy.h @@ -48,7 +48,7 @@ enum { #define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to policy */ #define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */ -#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */ +#define MPOL_MF_LAZY (1<<3) /* UNSUPPORTED FLAG: Lazy migrate on fault */ #define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */ #define MPOL_MF_VALID (MPOL_MF_STRICT | \ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b6461d7d200e..e01d3f807f77 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -636,12 +636,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, @@ -680,14 +674,6 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (endvma > end) endvma = end; - if (flags & MPOL_MF_LAZY) { - /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && - !(vma->vm_flags & VM_MIXEDMAP)) - change_prot_numa(vma, start, endvma); - return 1; - } - /* * Check page nodes, and queue pages to move, in the current vma. * But if no moving, and no strict checking, the scan can be skipped. @@ -1254,9 +1240,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (IS_ERR(new)) return PTR_ERR(new); - if (flags & MPOL_MF_LAZY) - new->flags |= MPOL_F_MOF; - /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all @@ -1301,7 +1284,6 @@ static long do_mbind(unsigned long start, unsigned long len, if (!err) { if (!list_empty(&pagelist)) { - WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed |= migrate_pages(&pagelist, new_folio, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- cgit v1.2.3 From 23e4883248f0472d806c8b3422ba6257e67bf1a5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:25:33 -0700 Subject: mm: add page_rmappable_folio() wrapper folio_prep_large_rmappable() is being used repeatedly along with a conversion from page to folio, a check non-NULL, a check order > 1: wrap it all up into struct folio *page_rmappable_folio(struct page *). Link: https://lkml.kernel.org/r/8d92c6cf-eebe-748-e29c-c8ab224c741@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/internal.h | 9 +++++++++ mm/mempolicy.c | 17 +++-------------- mm/page_alloc.c | 8 ++------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 3eceae1ec4c0..b61034bd50f5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -415,6 +415,15 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) void folio_undo_large_rmappable(struct folio *folio); +static inline struct folio *page_rmappable_folio(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + if (folio && folio_order(folio) > 1) + folio_prep_large_rmappable(folio); + return folio; +} + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e01d3f807f77..596d580f92d1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2122,10 +2122,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (pol->mode == MPOL_PREFERRED_MANY) { @@ -2135,10 +2132,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { @@ -2232,12 +2226,7 @@ EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5f46c7f85cb1..733732e7e0ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4598,12 +4598,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page = __alloc_pages(gfp | __GFP_COMP, order, - preferred_nid, nodemask); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + preferred_nid, nodemask); + return page_rmappable_folio(page); } EXPORT_SYMBOL(__folio_alloc); -- cgit v1.2.3 From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 Oct 2023 13:39:08 -0700 Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 5 +- include/linux/gfp.h | 10 +- include/linux/mempolicy.h | 20 ++- include/linux/mm.h | 2 +- ipc/shm.c | 21 +-- mm/mempolicy.c | 379 +++++++++++++++++++--------------------------- mm/shmem.c | 92 ++++++----- mm/swap.h | 9 +- mm/swap_state.c | 86 +++++++---- mm/zswap.c | 7 +- 10 files changed, 308 insertions(+), 323 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1d994505805b..66ae1c265da3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5b917e5b9350..de292a007138 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,7 @@ #include struct vm_area_struct; +struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); +struct folio *folio_alloc(gfp_t gfp, unsigned int order); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return alloc_pages(gfp, order); +} static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index acdb12fcb6cd..931b118336f4 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -17,6 +17,8 @@ struct mm_struct; +#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ + #ifdef CONFIG_NUMA /* @@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); @@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); struct mempolicy {}; +static inline struct mempolicy *get_task_policy(struct task_struct *p) +{ + return NULL; +} + static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; @@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) return NULL; } +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} + static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e496a8b8e50..14d5aaff96d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -619,7 +619,7 @@ struct vm_operations_struct { * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the diff --git a/ipc/shm.c b/ipc/shm.c index 576a543b7cff..222aaf035afb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 596d580f92d1..ded5508f0972 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -898,6 +898,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We @@ -909,10 +910,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -1170,6 +1168,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start) break; } + /* + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL + * when the page can no longer be located in a vma: that is not ideal + * (migrate_pages() will give up early, presuming ENOMEM), but good + * enough to avoid a crash by syzkaller or concurrent holepunch. + */ + if (!vma) + return NULL; + if (folio_test_hugetlb(src)) { return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); @@ -1178,9 +1185,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start) if (folio_test_large(src)) gfp = GFP_TRANSHUGE; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); } @@ -1690,34 +1694,19 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1726,14 +1715,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1743,8 +1736,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -1779,54 +1773,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) -{ - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; - - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} - -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. - */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) -{ - if (policy->mode == MPOL_PREFERRED) { - nid = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. - */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); - } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - return nid; -} - /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1886,11 +1832,11 @@ unsigned int mempolicy_slab_node(void) } /* - * Do static interleaving for a VMA with known offset @n. Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; @@ -1908,33 +1854,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -1950,27 +1917,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. - * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2048,27 +2004,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2081,7 +2018,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); @@ -2089,55 +2026,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. - * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - return page_rmappable_folio(page); - } - - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask_t *nodemask; + struct page *page; - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - return page_rmappable_folio(page); - } + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2148,39 +2059,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_pages_node(nid, + gfp | __GFP_THISNODE | __GFP_NORETRY, order); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, - nmask); + } + } - goto out; + page = __alloc_pages(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); } } - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + return page; +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @hugepage: Unused (was: For hugepages try only preferred node if possible). + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of alloc_pages_mpol() is more appropriate. + * + * Return: The folio on success or NULL if allocation fails. + */ +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) +{ + struct mempolicy *pol; + pgoff_t ilx; + struct page *page; + + pol = get_vma_policy(vma, addr, order, &ilx); + page = alloc_pages_mpol(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); -out: - return folio; + return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); @@ -2198,33 +2138,23 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; - struct page *page; - - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else if (pol->mode == MPOL_PREFERRED_MANY) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else - page = __alloc_pages(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); - return page; + return alloc_pages_mpol(gfp, order, + pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } @@ -2295,6 +2225,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); @@ -2307,9 +2239,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return __alloc_pages_bulk(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2496,23 +2429,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); - unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); break; case MPOL_PREFERRED: diff --git a/mm/shmem.c b/mm/shmem.c index bcbe9dbb9f5f..a314a25aea8c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif - -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; + struct mempolicy *mpol; + pgoff_t ilx; struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; - shmem_pseudo_vma_init(&pvma, info, index); - page = swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); if (!page) return NULL; @@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); + page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return page_rmappable_folio(page); } static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return (struct folio *)page; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ - folio = shmem_swapin(swap, gfp, info, index); + folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; @@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { diff --git a/mm/swap.h b/mm/swap.h index 8a3c7a0ace4f..73c332ee4d91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,8 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +struct mempolicy; + #ifdef CONFIG_SWAP #include /* for bio_end_io_t */ @@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, unsigned long addr, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void) } static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) + gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index ab79ffb71736..85d9e5806a6a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si; struct folio *folio; @@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { - bool page_was_allocated; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - if (page_was_allocated) - swap_readpage(retpage, false, plug); + mpol = get_vma_policy(vma, addr, 0, &ilx); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); - return retpage; + if (page_allocated) + swap_readpage(page, false, plug); + return page; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct page for entry and addr, after queueing swapin. * @@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!page) continue; if (page_allocated) { @@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. @@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!page) continue; if (page_allocated) { @@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } /** @@ -853,9 +868,16 @@ skip: struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + page = swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + return page; } #ifdef CONFIG_SYSFS diff --git a/mm/zswap.c b/mm/zswap.c index 37d2b1cb2ecb..060857adca76 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, { swp_entry_t swpentry = entry->swpentry; struct page *page; + struct mempolicy *mpol; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); @@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache page */ - page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, - &page_was_allocated); + mpol = get_task_policy(current); + page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &page_was_allocated); if (!page) { ret = -ENOMEM; goto fail; -- cgit v1.2.3 From 72e315f7a750281b4410ac30d8930f735459e72d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:27:47 -0700 Subject: mempolicy: mmap_lock is not needed while migrating folios mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout. They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address. Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page(). (After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.) migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it. It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern. Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 9 ------ mm/hugetlb.c | 38 +++++++++++----------- mm/mempolicy.c | 83 +++++++++++++++++++++++++------------------------ 3 files changed, 63 insertions(+), 67 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 158ff156710b..d3acecc5db4b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -748,8 +748,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address); int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, @@ -1072,13 +1070,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return NULL; } -static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h, - struct vm_area_struct *vma, - unsigned long address) -{ - return NULL; -} - static inline int __alloc_bootmem_huge_page(struct hstate *h) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dd8065e36038..1169ef2f2176 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2630,24 +2630,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask); } -/* mempolicy aware migration callback */ -struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address) -{ - struct mempolicy *mpol; - nodemask_t *nodemask; - struct folio *folio; - gfp_t gfp_mask; - int node; - - gfp_mask = htlb_alloc_mask(h); - node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); - mpol_cond_put(mpol); - - return folio; -} - /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -6559,6 +6541,26 @@ out_mutex: } #ifdef CONFIG_USERFAULTFD +/* + * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte(). + */ +static struct folio *alloc_hugetlb_folio_vma(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct folio *folio; + gfp_t gfp_mask; + int node; + + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask); + mpol_cond_put(mpol); + + return folio; +} + /* * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte * with modifications for hugetlb pages. diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ded5508f0972..0594362f2470 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -415,6 +415,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist, unsigned long flags); +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid); static bool strictly_unmovable(unsigned long flags) { @@ -1021,6 +1023,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, node_set(source, nmask); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); + + mmap_read_lock(mm); vma = find_vma(mm, 0); /* @@ -1031,6 +1035,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest, */ nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + mmap_read_unlock(mm); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, @@ -1059,8 +1064,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, lru_cache_disable(); - mmap_read_lock(mm); - /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' @@ -1140,7 +1143,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (err < 0) break; } - mmap_read_unlock(mm); lru_cache_enable(); if (err < 0) @@ -1149,44 +1151,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, } /* - * Allocate a new page for page migration based on vma policy. - * Start by assuming the page is mapped by the same vma as contains @start. - * Search forward from there, if not. N.B., this assumes that the - * list of pages handed to migrate_pages()--which is how we get here-- - * is in virtual address order. + * Allocate a new folio for page migration, according to NUMA mempolicy. */ -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { - struct vm_area_struct *vma; - unsigned long address; - VMA_ITERATOR(vmi, current->mm, start); - gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - - for_each_vma(vmi, vma) { - address = page_address_in_vma(&src->page, vma); - if (address != -EFAULT) - break; - } + struct mempolicy *pol = (struct mempolicy *)private; + pgoff_t ilx = 0; /* improve on this later */ + struct page *page; + unsigned int order; + int nid = numa_node_id(); + gfp_t gfp; - /* - * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL - * when the page can no longer be located in a vma: that is not ideal - * (migrate_pages() will give up early, presuming ENOMEM), but good - * enough to avoid a crash by syzkaller or concurrent holepunch. - */ - if (!vma) - return NULL; + order = folio_order(src); + ilx += src->index >> order; if (folio_test_hugetlb(src)) { - return alloc_hugetlb_folio_vma(folio_hstate(src), - vma, address); + nodemask_t *nodemask; + struct hstate *h; + + h = folio_hstate(src); + gfp = htlb_alloc_mask(h); + nodemask = policy_nodemask(gfp, pol, ilx, &nid); + return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp); } if (folio_test_large(src)) gfp = GFP_TRANSHUGE; + else + gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP; - return vma_alloc_folio(gfp, folio_order(src), vma, address, - folio_test_large(src)); + page = alloc_pages_mpol(gfp, order, pol, ilx, nid); + return page_rmappable_folio(page); } #else @@ -1202,7 +1198,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct folio *new_folio(struct folio *src, unsigned long start) +static struct folio *alloc_migration_target_by_mpol(struct folio *src, + unsigned long private) { return NULL; } @@ -1276,6 +1273,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (nr_failed < 0) { err = nr_failed; + nr_failed = 0; } else { vma_iter_init(&vmi, mm, start); prev = vma_prev(&vmi); @@ -1286,19 +1284,24 @@ static long do_mbind(unsigned long start, unsigned long len, } } - if (!err) { - if (!list_empty(&pagelist)) { - nr_failed |= migrate_pages(&pagelist, new_folio, NULL, - start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { + /* Convert MPOL_DEFAULT's NULL to task or default policy */ + if (!new) { + new = get_task_policy(current); + mpol_get(new); } - if (nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; + nr_failed |= migrate_pages(&pagelist, + alloc_migration_target_by_mpol, NULL, + (unsigned long)new, MIGRATE_SYNC, + MR_MEMPOLICY_MBIND, NULL); } + if (nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); - - mmap_write_unlock(mm); mpol_out: mpol_put(new); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) -- cgit v1.2.3 From 88c91dc58582f1d0c6f5f501051b0262d06ec4ed Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:29:00 -0700 Subject: mempolicy: migration attempt to match interleave nodes Improve alloc_migration_target_by_mpol()'s treatment of MPOL_INTERLEAVE. Make an effort in do_mbind(), to identify the correct interleave index for the first page to be migrated, so that it and all subsequent pages from the same vma will be targeted to precisely their intended nodes. Pages from following vmas will still be interleaved from the requested nodemask, but perhaps starting from a different base. Whether this is worth doing at all, or worth improving further, is arguable: queue_folio_required() is right not to care about the precise placement on interleaved nodes; but this little effort seems appropriate. [hughd@google.com: do vma_iter search under mmap_write_unlock()] Link: https://lkml.kernel.org/r/3311d544-fb05-a7f1-1b74-16aa0f6cd4fe@google.com Link: https://lkml.kernel.org/r/77954a5-9c9b-1c11-7d5c-3262c01b895f@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/mempolicy.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0594362f2470..5e472e6e0507 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -428,6 +428,11 @@ static bool strictly_unmovable(unsigned long flags) MPOL_MF_STRICT; } +struct migration_mpol { /* for alloc_migration_target_by_mpol() */ + struct mempolicy *pol; + pgoff_t ilx; +}; + struct queue_pages { struct list_head *pagelist; unsigned long flags; @@ -1156,8 +1161,9 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, static struct folio *alloc_migration_target_by_mpol(struct folio *src, unsigned long private) { - struct mempolicy *pol = (struct mempolicy *)private; - pgoff_t ilx = 0; /* improve on this later */ + struct migration_mpol *mmpol = (struct migration_mpol *)private; + struct mempolicy *pol = mmpol->pol; + pgoff_t ilx = mmpol->ilx; struct page *page; unsigned int order; int nid = numa_node_id(); @@ -1212,6 +1218,7 @@ static long do_mbind(unsigned long start, unsigned long len, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; struct vma_iterator vmi; + struct migration_mpol mmpol; struct mempolicy *new; unsigned long end; long err; @@ -1284,17 +1291,55 @@ static long do_mbind(unsigned long start, unsigned long len, } } - mmap_write_unlock(mm); - if (!err && !list_empty(&pagelist)) { /* Convert MPOL_DEFAULT's NULL to task or default policy */ if (!new) { new = get_task_policy(current); mpol_get(new); } + mmpol.pol = new; + mmpol.ilx = 0; + + /* + * In the interleaved case, attempt to allocate on exactly the + * targeted nodes, for the first VMA to be migrated; for later + * VMAs, the nodes will still be interleaved from the targeted + * nodemask, but one by one may be selected differently. + */ + if (new->mode == MPOL_INTERLEAVE) { + struct page *page; + unsigned int order; + unsigned long addr = -EFAULT; + + list_for_each_entry(page, &pagelist, lru) { + if (!PageKsm(page)) + break; + } + if (!list_entry_is_head(page, &pagelist, lru)) { + vma_iter_init(&vmi, mm, start); + for_each_vma_range(vmi, vma, end) { + addr = page_address_in_vma(page, vma); + if (addr != -EFAULT) + break; + } + } + if (addr != -EFAULT) { + order = compound_order(page); + /* We already know the pol, but not the ilx */ + mpol_cond_put(get_vma_policy(vma, addr, order, + &mmpol.ilx)); + /* Set base from which to increment by index */ + mmpol.ilx -= page->index >> order; + } + } + } + + mmap_write_unlock(mm); + + if (!err && !list_empty(&pagelist)) { nr_failed |= migrate_pages(&pagelist, alloc_migration_target_by_mpol, NULL, - (unsigned long)new, MIGRATE_SYNC, + (unsigned long)&mmpol, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL); } -- cgit v1.2.3 From 9fb2047d23d5087e124e7bc7a0abdc3f0d7f5d7e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 24 Oct 2023 17:37:50 +0200 Subject: Documentation: ubsan: drop "the" from article title Drop "the" from the title of the documentation article for UBSAN, as it is redundant. Also add SPDX-License-Identifier for ubsan.rst. Link: https://lkml.kernel.org/r/5fb11a4743eea9d9232a5284dea0716589088fec.1698161845.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- Documentation/dev-tools/ubsan.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/dev-tools/ubsan.rst b/Documentation/dev-tools/ubsan.rst index 1be6618e232d..2de7c63415da 100644 --- a/Documentation/dev-tools/ubsan.rst +++ b/Documentation/dev-tools/ubsan.rst @@ -1,5 +1,7 @@ -The Undefined Behavior Sanitizer - UBSAN -======================================== +.. SPDX-License-Identifier: GPL-2.0 + +Undefined Behavior Sanitizer - UBSAN +==================================== UBSAN is a runtime undefined behaviour checker. -- cgit v1.2.3 From cb61dad80fdc6a8f01b101e823a9335cd6d61071 Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Tue, 24 Oct 2023 16:45:09 -0700 Subject: zswap: export compression failure stats During a zswap store attempt, the compression algorithm could fail (for e.g due to the page containing incompressible random data). This is not tracked in any of existing zswap counters, making it hard to monitor for and investigate. We have run into this problem several times in our internal investigations on zswap store failures. This patch adds a dedicated debugfs counter for compression algorithm failures. Link: https://lkml.kernel.org/r/20231024234509.2680539-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 060857adca76..74411dfdad92 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -62,6 +62,8 @@ static u64 zswap_pool_limit_hit; static u64 zswap_written_back_pages; /* Store failed due to a reclaim failure after pool limit was reached */ static u64 zswap_reject_reclaim_fail; +/* Store failed due to compression algorithm failure */ +static u64 zswap_reject_compress_fail; /* Compressed page was too big for the allocator to (optimally) store */ static u64 zswap_reject_compress_poor; /* Store failed because underlying allocator could not get memory */ @@ -1312,8 +1314,10 @@ bool zswap_store(struct folio *folio) ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) + if (ret) { + zswap_reject_compress_fail++; goto put_dstmem; + } /* store */ zpool = zswap_find_zpool(entry); @@ -1553,6 +1557,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_reject_alloc_fail); debugfs_create_u64("reject_kmemcache_fail", 0444, zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_fail", 0444, + zswap_debugfs_root, &zswap_reject_compress_fail); debugfs_create_u64("reject_compress_poor", 0444, zswap_debugfs_root, &zswap_reject_compress_poor); debugfs_create_u64("written_back_pages", 0444, -- cgit v1.2.3 From ca6c2ce1b481996ae5b16e4589d3c0dd61899fa8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 18 Oct 2023 22:50:14 +0800 Subject: mm/vmalloc: fix the unchecked dereference warning in vread_iter() LKP reported smatch warning as below: =================== smatch warnings: mm/vmalloc.c:3689 vread_iter() error: we previously assumed 'vm' could be null (see line 3667) ...... 06c8994626d1b7 @3667 size = vm ? get_vm_area_size(vm) : va_size(va); ...... 06c8994626d1b7 @3689 else if (!(vm->flags & VM_IOREMAP)) ^^^^^^^^^ Unchecked dereference ===================== This is not a runtime bug because the possible null 'vm' in the pointed place could only happen when flags == VMAP_BLOCK. However, the case 'flags == VMAP_BLOCK' should never happen and has been detected with WARN_ON. Please check vm_map_ram() implementation and the earlier checking in vread_iter() at below: ~~~~~~~~~~~~~~~~~~~~~~~~~~ /* * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need * be set together with VMAP_RAM. */ WARN_ON(flags == VMAP_BLOCK); if (!vm && !flags) continue; ~~~~~~~~~~~~~~~~~~~~~~~~~~ So add checking on whether 'vm' could be null when dereferencing it in vread_iter(). This mutes smatch complaint. Link: https://lkml.kernel.org/r/ZTCURc8ZQE+KrTvS@MiWiFi-R3L-srv Link: https://lkml.kernel.org/r/ZS/2k6DIMd0tZRgK@MiWiFi-R3L-srv Signed-off-by: Baoquan He Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202310171600.WCrsOwFj-lkp@intel.com/ Cc: Lorenzo Stoakes Cc: Philip Li Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a3fedb3ee0db..d12a17fc0c17 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3809,7 +3809,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count) if (flags & VMAP_RAM) copied = vmap_ram_vread_iter(iter, addr, n, flags); - else if (!(vm->flags & VM_IOREMAP)) + else if (!(vm && (vm->flags & VM_IOREMAP))) copied = aligned_vread_iter(iter, addr, n); else /* IOREMAP area is treated as memory hole */ copied = zero_iter(iter, n); -- cgit v1.2.3 From 9e1b016a0bc97112a18427bce1b6849d28c6eb91 Mon Sep 17 00:00:00 2001 From: Tom Yang Date: Mon, 23 Oct 2023 17:57:37 +0800 Subject: Documentation: maple_tree: fix word spelling error The "first" is spelled "fist". Link: https://lkml.kernel.org/r/20231023095737.21823-1-yangqixiao@inspur.com Signed-off-by: Tom Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- Documentation/core-api/maple_tree.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/core-api/maple_tree.rst b/Documentation/core-api/maple_tree.rst index 45defcf15da7..96f3d5f076b5 100644 --- a/Documentation/core-api/maple_tree.rst +++ b/Documentation/core-api/maple_tree.rst @@ -175,7 +175,7 @@ will return the previous entry which occurs before the entry at index. mas_find() will find the first entry which exists at or above index on the first call, and the next entry from every subsequent calls. -mas_find_rev() will find the fist entry which exists at or below the last on +mas_find_rev() will find the first entry which exists at or below the last on the first call, and the previous entry from every subsequent calls. If the user needs to yield the lock during an operation, then the maple state -- cgit v1.2.3 From 6479b29203dedb724f4f4e08e1cc0dcf432c333b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Fri, 20 Oct 2023 15:20:09 -0700 Subject: selftests: add a sanity check for zswap We recently encountered a bug that makes all zswap store attempt fail. Specifically, after: "141fdeececb3 mm/zswap: delay the initialization of zswap" if we build a kernel with zswap disabled by default, then enabled after the swapfile is set up, the zswap tree will not be initialized. As a result, all zswap store calls will be short-circuited. We have to perform another swapon to get zswap working properly again. Fortunately, this issue has since been fixed by the patch that kills frontswap: "42c06a0e8ebe mm: kill frontswap" which performs zswap_swapon() unconditionally, i.e always initializing the zswap tree. This test add a sanity check that ensure zswap storing works as intended. Link: https://lkml.kernel.org/r/20231020222009.2358953-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Acked-by: Rik van Riel Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 48 +++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 49def87a909b..c99d2adaca3f 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -55,6 +55,11 @@ static int get_zswap_written_back_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/written_back_pages", value); } +static long get_zswpout(const char *cgroup) +{ + return cg_read_key_long(cgroup, "memory.stat", "zswpout "); +} + static int allocate_bytes(const char *cgroup, void *arg) { size_t size = (size_t)arg; @@ -68,6 +73,48 @@ static int allocate_bytes(const char *cgroup, void *arg) return 0; } +/* + * Sanity test to check that pages are written into zswap. + */ +static int test_zswap_usage(const char *root) +{ + long zswpout_before, zswpout_after; + int ret = KSFT_FAIL; + char *test_group; + + /* Set up */ + test_group = cg_name(root, "no_shrink_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "1M")) + goto out; + + zswpout_before = get_zswpout(test_group); + if (zswpout_before < 0) { + ksft_print_msg("Failed to get zswpout\n"); + goto out; + } + + /* Allocate more than memory.max to push memory into zswap */ + if (cg_run(test_group, allocate_bytes, (void *)MB(4))) + goto out; + + /* Verify that pages come into zswap */ + zswpout_after = get_zswpout(test_group); + if (zswpout_after <= zswpout_before) { + ksft_print_msg("zswpout does not increase after test program\n"); + goto out; + } + ret = KSFT_PASS; + +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + /* * When trying to store a memcg page in zswap, if the memcg hits its memory * limit in zswap, writeback should not be triggered. @@ -235,6 +282,7 @@ struct zswap_test { int (*fn)(const char *root); const char *name; } tests[] = { + T(test_zswap_usage), T(test_no_kmem_bypass), T(test_no_invasive_cgroup_shrink), }; -- cgit v1.2.3 From 19467a950b49432a84bf6dbadbbb17bdf89418b7 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sun, 22 Oct 2023 21:07:33 +0000 Subject: mm/damon/sysfs: remove requested targets when online-commit inputs damon_sysfs_set_targets(), which updates the targets of the context for online commitment, do not remove targets that removed from the corresponding sysfs files. As a result, more than intended targets of the context can exist and hence consume memory and monitoring CPU resource more than expected. Fix it by removing all targets of the context and fill up again using the user input. This could cause unnecessary memory dealloc and realloc operations, but this is not a hot code path. Also, note that damon_target is stateless, and hence no data is lost. [sj@kernel.org: fix unnecessary monitoring results removal] Link: https://lkml.kernel.org/r/20231028213353.45397-1-sj@kernel.org Link: https://lkml.kernel.org/r/20231022210735.46409-2-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: [5.19.x] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 70 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 5846970cedce..1a231bde18f9 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,58 +1150,60 @@ destroy_targets_out: return err; } -/* - * Search a target in a context that corresponds to the sysfs target input. - * - * Return: pointer to the target if found, NULL if not found, or negative - * error code if the search failed. - */ -static struct damon_target *damon_sysfs_existing_target( - struct damon_sysfs_target *sys_target, struct damon_ctx *ctx) +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) { struct pid *pid; - struct damon_target *t; + struct damon_region *r, *next; - if (!damon_target_has_pid(ctx)) { - /* Up to only one target for paddr could exist */ - damon_for_each_target(t, ctx) - return t; - return NULL; - } + if (!damon_target_has_pid(ctx)) + return 0; - /* ops.id should be DAMON_OPS_VADDR or DAMON_OPS_FVADDR */ pid = find_get_pid(sys_target->pid); if (!pid) - return ERR_PTR(-EINVAL); - damon_for_each_target(t, ctx) { - if (t->pid == pid) { - put_pid(pid); - return t; - } + return -EINVAL; + + /* no change to the target */ + if (pid == target->pid) { + put_pid(pid); + return 0; } - put_pid(pid); - return NULL; + + /* remove old monitoring results and update the target's pid */ + damon_for_each_region_safe(r, next, target) + damon_destroy_region(r, target); + put_pid(target->pid); + target->pid = pid; + return 0; } static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { - int i, err; + struct damon_target *t, *next; + int i = 0, err; /* Multiple physical address space monitoring targets makes no sense */ if (ctx->ops.id == DAMON_OPS_PADDR && sysfs_targets->nr > 1) return -EINVAL; - for (i = 0; i < sysfs_targets->nr; i++) { + damon_for_each_target_safe(t, next, ctx) { + if (i < sysfs_targets->nr) { + damon_sysfs_update_target(t, ctx, + sysfs_targets->targets_arr[i]); + } else { + if (damon_target_has_pid(ctx)) + put_pid(t->pid); + damon_destroy_target(t); + } + i++; + } + + for (; i < sysfs_targets->nr; i++) { struct damon_sysfs_target *st = sysfs_targets->targets_arr[i]; - struct damon_target *t = damon_sysfs_existing_target(st, ctx); - - if (IS_ERR(t)) - return PTR_ERR(t); - if (!t) - err = damon_sysfs_add_target(st, ctx); - else - err = damon_sysfs_set_regions(t, st->regions); + + err = damon_sysfs_add_target(st, ctx); if (err) return err; } -- cgit v1.2.3 From 9732336006764e2ee61225387e3c70eae9139035 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 31 Oct 2023 17:01:31 +0000 Subject: mm/damon/sysfs: update monitoring target regions for online input commit When user input is committed online, DAMON sysfs interface is ignoring the user input for the monitoring target regions. Such request is valid and useful for fixed monitoring target regions-based monitoring ops like 'paddr' or 'fvaddr'. Update the region boundaries as user specified, too. Note that the monitoring results of the regions that overlap between the latest monitoring target regions and the new target regions are preserved. Treat empty monitoring target regions user request as a request to just make no change to the monitoring target regions. Otherwise, users should set the monitoring target regions same to current one for every online input commit, and it could be challenging for dynamic monitoring target regions update DAMON ops like 'vaddr'. If the user really need to remove all monitoring target regions, they can simply remove the target and then create the target again with empty target regions. Link: https://lkml.kernel.org/r/20231031170131.46972-1-sj@kernel.org Fixes: da87878010e5 ("mm/damon/sysfs: support online inputs update") Signed-off-by: SeongJae Park Cc: [5.19+] Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1a231bde18f9..e27846708b5a 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1150,34 +1150,47 @@ destroy_targets_out: return err; } -static int damon_sysfs_update_target(struct damon_target *target, - struct damon_ctx *ctx, - struct damon_sysfs_target *sys_target) +static int damon_sysfs_update_target_pid(struct damon_target *target, int pid) { - struct pid *pid; - struct damon_region *r, *next; - - if (!damon_target_has_pid(ctx)) - return 0; + struct pid *pid_new; - pid = find_get_pid(sys_target->pid); - if (!pid) + pid_new = find_get_pid(pid); + if (!pid_new) return -EINVAL; - /* no change to the target */ - if (pid == target->pid) { - put_pid(pid); + if (pid_new == target->pid) { + put_pid(pid_new); return 0; } - /* remove old monitoring results and update the target's pid */ - damon_for_each_region_safe(r, next, target) - damon_destroy_region(r, target); put_pid(target->pid); - target->pid = pid; + target->pid = pid_new; return 0; } +static int damon_sysfs_update_target(struct damon_target *target, + struct damon_ctx *ctx, + struct damon_sysfs_target *sys_target) +{ + int err; + + if (damon_target_has_pid(ctx)) { + err = damon_sysfs_update_target_pid(target, sys_target->pid); + if (err) + return err; + } + + /* + * Do monitoring target region boundary update only if one or more + * regions are set by the user. This is for keeping current monitoring + * target results and range easier, especially for dynamic monitoring + * target regions update ops like 'vaddr'. + */ + if (sys_target->regions->nr) + err = damon_sysfs_set_regions(target, sys_target->regions); + return err; +} + static int damon_sysfs_set_targets(struct damon_ctx *ctx, struct damon_sysfs_targets *sysfs_targets) { -- cgit v1.2.3