From 6212eb4d7a636bdfe0c11c84aa90db3fb5e6a0ff Mon Sep 17 00:00:00 2001
From: Hongbo Li
Date: Mon, 8 Jan 2024 12:48:15 +0800
Subject: mm/filemap: avoid type conversion

The return type of folio_test_hugetlb() is bool; there is no need to
assign its result to an int.

Link: https://lkml.kernel.org/r/20240108044815.3291487-1-lihongbo22@huawei.com
Signed-off-by: Hongbo Li
Reviewed-by: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db..0d7e20edf46f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -843,7 +843,7 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
 {
 	XA_STATE(xas, &mapping->i_pages, index);
-	int huge = folio_test_hugetlb(folio);
+	bool huge = folio_test_hugetlb(folio);
 	bool charged = false;
 	long nr = 1;
 
-- cgit v1.2.3

From 5662400a9ac03f38ef3b84e4ff9a640a4604bef9 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Thu, 11 Jan 2024 15:24:20 +0000
Subject: mm: add pfn_swap_entry_folio()

Patch series "mm: convert mm counter to take a folio", v3.

Make sure all mm_counter() and mm_counter_file() callers have a folio,
then convert the mm counter functions to take a folio, which saves some
compound_head() calls.

This patch (of 10):

Thanks to the compound_head() hidden inside PageLocked(), this saves a
call to compound_head() over calling
page_folio(pfn_swap_entry_to_page()).

Link: https://lkml.kernel.org/r/20240111152429.3374566-1-willy@infradead.org
Link: https://lkml.kernel.org/r/20240111152429.3374566-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Cc: David Hildenbrand
Cc: Kefeng Wang
Signed-off-by: Andrew Morton
---
 include/linux/swapops.h | 13 +++++++++++++
 mm/filemap.c            |  2 +-
 mm/huge_memory.c        |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index bff1e8d97de0..48b700ba1d18 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -468,6 +468,19 @@ static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
 	return p;
 }
 
+static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
+{
+	struct folio *folio = pfn_folio(swp_offset_pfn(entry));
+
+	/*
+	 * Any use of migration entries may only occur while the
+	 * corresponding folio is locked
+	 */
+	BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));
+
+	return folio;
+}
+
 /*
  * A pfn swap entry is a special type of swap entry that always has a pfn stored
  * in the swap offset. They are used to represent unaddressable device memory
diff --git a/mm/filemap.c b/mm/filemap.c
index 0d7e20edf46f..142864338ca4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1354,7 +1354,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
 	unsigned long pflags;
 	bool in_thrashing;
 	wait_queue_head_t *q;
-	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+	struct folio *folio = pfn_swap_entry_folio(entry);
 
 	q = folio_waitqueue(folio);
 	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 94c958f7ebb5..5468b2f97cbf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2045,7 +2045,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
 	if (is_swap_pmd(*pmd)) {
 		swp_entry_t entry = pmd_to_swp_entry(*pmd);
-		struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
+		struct folio *folio = pfn_swap_entry_folio(entry);
 		pmd_t newpmd;
 
 		VM_BUG_ON(!is_pmd_migration_entry(*pmd));
-- cgit v1.2.3
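
A minimal sketch of the caller-side conversion this helper enables: where a
caller only needs the folio behind a pfn swap entry, pfn_swap_entry_folio()
replaces the page_folio(pfn_swap_entry_to_page()) combination and its hidden
compound_head() call. The function example_swap_entry_folio_locked below is a
hypothetical name used only to illustrate the pattern; it is not part of the
patch.

    /* Hypothetical caller, for illustration only; assumes <linux/swapops.h>. */
    static inline bool example_swap_entry_folio_locked(swp_entry_t entry)
    {
            /*
             * Old style, with an extra compound_head() hidden in page_folio():
             *   struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));
             */

            /* New style: go straight to the folio. */
            struct folio *folio = pfn_swap_entry_folio(entry);

            return folio_test_locked(folio);
    }
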
From 8897277acfef7f70fdecc054073bea2542fc7a1b Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 26 Feb 2024 15:55:28 -0500
Subject: mm: support order-1 folios in the page cache

Folios of order 1 have no space to store the deferred list.  This is not a
problem for the page cache as file-backed folios are never placed on the
deferred list.  All we need to do is prevent the core MM from touching the
deferred list for order-1 folios and remove the code which prevented us
from allocating order-1 folios.

Link: https://lore.kernel.org/linux-mm/90344ea7-4eec-47ee-5996-0c22f42d6a6a@google.com/
Link: https://lkml.kernel.org/r/20240226205534.1603748-3-zi.yan@sent.com
Signed-off-by: Matthew Wilcox (Oracle)
Signed-off-by: Zi Yan
Cc: David Hildenbrand
Cc: Hugh Dickins
Cc: Kirill A. Shutemov
Cc: Luis Chamberlain
Cc: Michal Koutny
Cc: Roman Gushchin
Cc: Ryan Roberts
Cc: Yang Shi
Cc: Yu Zhao
Cc: Zach O'Keefe
Signed-off-by: Andrew Morton
---
 mm/filemap.c     |  2 --
 mm/huge_memory.c | 19 +++++++++++++++----
 mm/internal.h    |  3 +--
 mm/readahead.c   |  3 ---
 4 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index b7a21551fbc7..b4858d89f1b1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1912,8 +1912,6 @@ no_page:
 			gfp_t alloc_gfp = gfp;
 
 			err = -ENOMEM;
-			if (order == 1)
-				order = 0;
 			if (order > 0)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
 			folio = filemap_alloc_folio(alloc_gfp, order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b20e535e874c..9840f312c08f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -790,8 +790,10 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
 
 void folio_prep_large_rmappable(struct folio *folio)
 {
-	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
-	INIT_LIST_HEAD(&folio->_deferred_list);
+	if (!folio || !folio_test_large(folio))
+		return;
+	if (folio_order(folio) > 1)
+		INIT_LIST_HEAD(&folio->_deferred_list);
 	folio_set_large_rmappable(folio);
 }
 
@@ -3114,7 +3116,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock(&ds_queue->split_queue_lock);
 	if (folio_ref_freeze(folio, 1 + extra_pins)) {
-		if (!list_empty(&folio->_deferred_list)) {
+		if (folio_order(folio) > 1 &&
+		    !list_empty(&folio->_deferred_list)) {
 			ds_queue->split_queue_len--;
 			list_del(&folio->_deferred_list);
 		}
@@ -3165,6 +3168,9 @@ void folio_undo_large_rmappable(struct folio *folio)
 	struct deferred_split *ds_queue;
 	unsigned long flags;
 
+	if (folio_order(folio) <= 1)
+		return;
+
 	/*
 	 * At this point, there is no one trying to add the folio to
 	 * deferred_list. If folio is not in deferred_list, it's safe
@@ -3190,7 +3196,12 @@ void deferred_split_folio(struct folio *folio)
 #endif
 	unsigned long flags;
 
-	VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
+	/*
+	 * Order 1 folios have no space for a deferred list, but we also
+	 * won't waste much memory by not adding them to the deferred list.
+	 */
+	if (folio_order(folio) <= 1)
+		return;
 
 	/*
 	 * The try_to_unmap() in page reclaim path might reach here too,
diff --git a/mm/internal.h b/mm/internal.h
index cb4eabb1051d..f376e3afbc4c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -420,8 +420,7 @@ static inline struct folio *page_rmappable_folio(struct page *page)
 {
 	struct folio *folio = (struct folio *)page;
 
-	if (folio && folio_order(folio) > 1)
-		folio_prep_large_rmappable(folio);
+	folio_prep_large_rmappable(folio);
 	return folio;
 }
 
diff --git a/mm/readahead.c b/mm/readahead.c
index 1e74455f908e..130c0e7df99f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -514,9 +514,6 @@ void page_cache_ra_order(struct readahead_control *ractl,
 		/* Don't allocate pages past EOF */
 		while (index + (1UL << order) - 1 > limit)
 			order--;
-		/* THP machinery does not support order-1 */
-		if (order == 1)
-			order = 0;
 		err = ra_alloc_folio(ractl, index, mark, order, gfp);
 		if (err)
 			break;
-- cgit v1.2.3
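
The rule these hunks establish can be summarized in a short sketch: a folio's
_deferred_list only exists for orders greater than 1, so any code touching it
must check the order first. The function example_add_to_deferred_split below
and its "queue" argument are illustrative names only, not kernel symbols, and
the real callers additionally take split_queue_lock and handle the memcg
queues.

    /* Hypothetical sketch of the order guard; locking omitted for brevity. */
    static void example_add_to_deferred_split(struct folio *folio,
                                              struct list_head *queue)
    {
            /* Order-0 and order-1 folios have no space for _deferred_list. */
            if (folio_order(folio) <= 1)
                    return;

            if (list_empty(&folio->_deferred_list))
                    list_add_tail(&folio->_deferred_list, queue);
    }
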
From 58f327f2ce80f9c7b4a70e9cf017ae8810d44a20 Mon Sep 17 00:00:00 2001
From: ZhangPeng
Date: Wed, 6 Mar 2024 16:38:09 +0800
Subject: filemap: avoid unnecessary major faults in filemap_fault()

A major fault can occur when an application uses mlockall(MCL_CURRENT |
MCL_FUTURE), leading to an unexpected issue[1].

It is caused by the PTE being temporarily cleared during a
read+clear/modify/write update of the PTE, e.g. in
do_numa_page()/change_pte_range().

For the data segment of a user-mode program, the global variable area is
a private file mapping.  After the page cache is populated, a private
anonymous page is created once COW is triggered.  mlockall() can lock the
COW pages (anonymous pages), but the original file pages cannot be locked
and may be reclaimed.  If the global variable (private anon page) is
accessed while vmf->pte is temporarily zeroed during a NUMA fault, a file
page fault is triggered.  At that point the original private file page
may already have been reclaimed; if it is no longer in the page cache, a
major fault is triggered and the file is read from disk, causing
additional overhead.

This issue affects our traffic-analysis service, which handles heavy
inbound traffic.  When a major fault occurs, I/O scheduling kicks in and
the original I/O is suspended.  The scheduling delay is typically 0.7 ms,
but if other applications are using the disk the wait can exceed 10 ms.
Because the inbound traffic is heavy and the NIC buffer is small, this
leads to packet loss, which the traffic-analysis service cannot tolerate.

Fix this by holding the PTL and rechecking the PTE in filemap_fault()
before triggering a major fault.  The check is done only if the VMA is
VM_LOCKED, to reduce the performance impact in common scenarios.

In our production environment there were 7 major faults every 12 hours;
after applying the patch, no major faults have been triggered.

File page read and write page-fault performance was tested on ext4 and
ramdisk using will-it-scale[2] on an x86 physical machine.  The data is
the average change compared with mainline after the patch is applied, and
the results are within the range of normal fluctuation.  Since the check
is done only for VM_LOCKED VMAs, no performance regression is introduced
for the most common cases.  The test results are as follows:

                             processes  processes_idle  threads  threads_idle
ext4 private file write:         0.22%           0.26%    1.21%        -0.15%
ext4 private file read:          0.03%           1.00%    1.39%         0.34%
ext4 shared file write:         -0.50%          -0.02%   -0.14%        -0.02%
ramdisk private file write:      0.07%           0.02%    0.53%         0.04%
ramdisk private file read:       0.01%           1.60%   -0.32%        -0.02%

[1] https://lore.kernel.org/linux-mm/9e62fd9a-bee0-52bf-50a7-498fa17434ee@huawei.com/
[2] https://github.com/antonblanchard/will-it-scale/

Link: https://lkml.kernel.org/r/20240306083809.1236634-1-zhangpeng362@huawei.com
Signed-off-by: ZhangPeng
Signed-off-by: Kefeng Wang
Suggested-by: "Huang, Ying"
Suggested-by: David Hildenbrand
Reviewed-by: "Huang, Ying"
Reviewed-by: David Hildenbrand
Signed-off-by: Andrew Morton
---
 mm/filemap.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'mm/filemap.c')

diff --git a/mm/filemap.c b/mm/filemap.c
index b4858d89f1b1..31ab455c4537 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3181,6 +3181,48 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
 	return fpin;
 }
 
+static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	vm_fault_t ret = 0;
+	pte_t *ptep;
+
+	/*
+	 * We might have COW'ed a pagecache folio and might now have an mlocked
+	 * anon folio mapped. The original pagecache folio is not mlocked and
+	 * might have been evicted. During a read+clear/modify/write update of
+	 * the PTE, such as done in do_numa_page()/change_pte_range(), we
+	 * temporarily clear the PTE under PT lock and might detect it here as
+	 * "none" when not holding the PT lock.
+	 *
+	 * Not rechecking the PTE under PT lock could result in an unexpected
+	 * major fault in an mlock'ed region. Recheck only for this special
+	 * scenario while holding the PT lock, to not degrade non-mlocked
+	 * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
+	 * the number of times we hold PT lock.
+	 */
+	if (!(vma->vm_flags & VM_LOCKED))
+		return 0;
+
+	if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
+		return 0;
+
+	ptep = pte_offset_map(vmf->pmd, vmf->address);
+	if (unlikely(!ptep))
+		return VM_FAULT_NOPAGE;
+
+	if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
+		ret = VM_FAULT_NOPAGE;
+	} else {
+		spin_lock(vmf->ptl);
+		if (unlikely(!pte_none(ptep_get(ptep))))
+			ret = VM_FAULT_NOPAGE;
+		spin_unlock(vmf->ptl);
+	}
+	pte_unmap(ptep);
+	return ret;
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vmf: struct vm_fault containing details of the fault
@@ -3236,6 +3278,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 			mapping_locked = true;
 		}
 	} else {
+		ret = filemap_fault_recheck_pte_none(vmf);
+		if (unlikely(ret))
+			return ret;
+
 		/* No page in the page cache at all */
 		count_vm_event(PGMAJFAULT);
 		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
-- cgit v1.2.3
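
The scenario described in the changelog can be sketched from userspace roughly
as follows. This is an illustrative program based on the description above,
not the reporters' actual workload or reproducer: a file-backed global
variable is COW'ed after a write, mlockall() keeps the anonymous copy
resident, and with the fix a later access no longer major-faults even if the
original page-cache page was reclaimed while the PTE was transiently cleared
(e.g. by NUMA hinting).

    /* Illustrative userspace sketch, not the original reproducer. */
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    /* Lives in the data segment: a private file-backed mapping of the binary. */
    static char global_buf[4096] = { 1 };

    int main(void)
    {
            /* Lock current and future mappings, as in the report. */
            if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
                    perror("mlockall");
                    return 1;
            }

            /*
             * Writing triggers COW: the mlocked anonymous copy stays
             * resident, while the original page-cache page may still be
             * reclaimed. With the patch, a transiently cleared PTE no
             * longer turns a later access into a major fault.
             */
            memset(global_buf, 0x5a, sizeof(global_buf));
            printf("%d\n", global_buf[0]);
            return 0;
    }
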